From 5cf9a16d9735d639e029f11a756ebc5e0ad0e519 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 9 Apr 2026 16:35:14 -0300 Subject: [PATCH] monitoring: align overview panels with jobs and point-based suite rates --- scripts/dashboards_render_atlas.py | 141 ++++++++----- .../monitoring/dashboards/atlas-jobs.json | 33 +-- .../monitoring/dashboards/atlas-overview.json | 199 +++++++++++------- .../monitoring/dashboards/atlas-power.json | 8 +- .../monitoring/grafana-dashboard-jobs.yaml | 33 +-- .../grafana-dashboard-overview.yaml | 199 +++++++++++------- .../monitoring/grafana-dashboard-power.yaml | 8 +- 7 files changed, 393 insertions(+), 228 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 6ee85e20..0cd00f52 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -438,7 +438,8 @@ TEST_FAILURES_24H_TOTAL = ( '(sum(increase(ariadne_task_runs_total{status!="ok"}[24h])) or on() vector(0)) + ' '(sum(increase(metis_builds_total{status="error"}[24h])) or on() vector(0)) + ' '(sum(increase(metis_flashes_total{status="error"}[24h])) or on() vector(0)) + ' - '(sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="failed"}[24h])) or on() vector(0))' + '(sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="failed"}[24h])) or on() vector(0)) + ' + '(sum(increase(platform_quality_gate_runs_total{status!~"ok|passed|success"}[24h])) or on() vector(0))' ) PLATFORM_TEST_ACTIVITY_30D = ( 'label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), "source", "ariadne", "__name__", ".*") ' @@ -446,32 +447,35 @@ PLATFORM_TEST_ACTIVITY_30D = ( 'or label_replace(sum by (status) (increase(metis_flashes_total[30d])), "source", "metis-flash", "__name__", ".*") ' 'or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite="ananke"}[30d])), "source", "ananke-quality", "__name__", ".*")' ) -PLATFORM_TEST_ROLLING_WINDOW = "30d" -ARIADNE_SUITE_OK_INTERVAL = f'sum(increase(ariadne_task_runs_total{{status="ok"}}[{PLATFORM_TEST_ROLLING_WINDOW}]))' -ARIADNE_SUITE_TOTAL_INTERVAL = f'sum(increase(ariadne_task_runs_total[{PLATFORM_TEST_ROLLING_WINDOW}]))' +PLATFORM_TEST_POINT_WINDOW = "$__interval" +ARIADNE_SUITE_OK_INTERVAL = f'sum(increase(ariadne_task_runs_total{{status="ok"}}[{PLATFORM_TEST_POINT_WINDOW}]))' +ARIADNE_SUITE_TOTAL_INTERVAL = f'sum(increase(ariadne_task_runs_total[{PLATFORM_TEST_POINT_WINDOW}]))' METIS_SUITE_OK_INTERVAL = ( - f'(sum(increase(metis_builds_total{{status="ok"}}[{PLATFORM_TEST_ROLLING_WINDOW}])) + ' - f'sum(increase(metis_flashes_total{{status="ok"}}[{PLATFORM_TEST_ROLLING_WINDOW}])))' + f'(sum(increase(metis_builds_total{{status="ok"}}[{PLATFORM_TEST_POINT_WINDOW}])) + ' + f'sum(increase(metis_flashes_total{{status="ok"}}[{PLATFORM_TEST_POINT_WINDOW}])))' ) METIS_SUITE_TOTAL_INTERVAL = ( - f'(sum(increase(metis_builds_total[{PLATFORM_TEST_ROLLING_WINDOW}])) + ' - f'sum(increase(metis_flashes_total[{PLATFORM_TEST_ROLLING_WINDOW}])))' + f'(sum(increase(metis_builds_total[{PLATFORM_TEST_POINT_WINDOW}])) + ' + f'sum(increase(metis_flashes_total[{PLATFORM_TEST_POINT_WINDOW}])))' ) ANANKE_SUITE_OK_INTERVAL = ( - f'sum(increase(ananke_quality_gate_runs_total{{suite="ananke",status="ok"}}[{PLATFORM_TEST_ROLLING_WINDOW}]))' + f'sum(increase(ananke_quality_gate_runs_total{{suite="ananke",status="ok"}}[{PLATFORM_TEST_POINT_WINDOW}]))' ) ANANKE_SUITE_TOTAL_INTERVAL = ( - f'sum(increase(ananke_quality_gate_runs_total{{suite="ananke"}}[{PLATFORM_TEST_ROLLING_WINDOW}]))' + f'sum(increase(ananke_quality_gate_runs_total{{suite="ananke"}}[{PLATFORM_TEST_POINT_WINDOW}]))' ) PLATFORM_TEST_SUCCESS_RATE_ARIADNE_SERIES = ( - f'100 * ({ARIADNE_SUITE_OK_INTERVAL}) / clamp_min(({ARIADNE_SUITE_TOTAL_INTERVAL}), 1)' + f'(100 * ({ARIADNE_SUITE_OK_INTERVAL}) / clamp_min(({ARIADNE_SUITE_TOTAL_INTERVAL}), 1)) ' + f'and on() (({ARIADNE_SUITE_TOTAL_INTERVAL}) > 0)' ) PLATFORM_TEST_SUCCESS_RATE_METIS_SERIES = ( - f'100 * ({METIS_SUITE_OK_INTERVAL}) / clamp_min(({METIS_SUITE_TOTAL_INTERVAL}), 1)' + f'(100 * ({METIS_SUITE_OK_INTERVAL}) / clamp_min(({METIS_SUITE_TOTAL_INTERVAL}), 1)) ' + f'and on() (({METIS_SUITE_TOTAL_INTERVAL}) > 0)' ) PLATFORM_TEST_SUCCESS_RATE_ANANKE_SERIES = ( - f'100 * ({ANANKE_SUITE_OK_INTERVAL}) / clamp_min(({ANANKE_SUITE_TOTAL_INTERVAL}), 1)' + f'(100 * ({ANANKE_SUITE_OK_INTERVAL}) / clamp_min(({ANANKE_SUITE_TOTAL_INTERVAL}), 1)) ' + f'and on() (({ANANKE_SUITE_TOTAL_INTERVAL}) > 0)' ) PLATFORM_TEST_GENERIC_SUITE_NAMES = [ @@ -487,9 +491,10 @@ PLATFORM_TEST_GENERIC_SUITE_TARGETS = [ { "refId": chr(ord("D") + index), "expr": ( - f'100 * (sum(increase(platform_quality_gate_runs_total{{suite="{suite}",status=~"ok|passed|success"}}' - f'[{PLATFORM_TEST_ROLLING_WINDOW}]))) / ' - f'clamp_min((sum(increase(platform_quality_gate_runs_total{{suite="{suite}"}}[{PLATFORM_TEST_ROLLING_WINDOW}]))), 1)' + f'(100 * (sum(increase(platform_quality_gate_runs_total{{suite="{suite}",status=~"ok|passed|success"}}' + f'[{PLATFORM_TEST_POINT_WINDOW}]))) / ' + f'clamp_min((sum(increase(platform_quality_gate_runs_total{{suite="{suite}"}}[{PLATFORM_TEST_POINT_WINDOW}]))), 1)) ' + f'and on() ((sum(increase(platform_quality_gate_runs_total{{suite="{suite}"}}[{PLATFORM_TEST_POINT_WINDOW}]))) > 0)' ), "legendFormat": suite, } @@ -1341,10 +1346,10 @@ def build_overview(): text_mode="name_and_value", targets=[ {"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Draw (W)", "instant": True}, - {"refId": "B", "expr": ANANKE_UPS_RUNTIME_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Discharge ETA", "instant": True}, + {"refId": "B", "expr": ANANKE_UPS_RUNTIME_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Discharge", "instant": True}, {"refId": "C", "expr": ANANKE_UPS_ON_BATTERY_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Status", "instant": True}, {"refId": "D", "expr": ANANKE_UPS_DRAW_WATTS_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Draw (W)", "instant": True}, - {"refId": "E", "expr": ANANKE_UPS_RUNTIME_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Discharge ETA", "instant": True}, + {"refId": "E", "expr": ANANKE_UPS_RUNTIME_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Discharge", "instant": True}, {"refId": "F", "expr": ANANKE_UPS_ON_BATTERY_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Status", "instant": True}, ], field_overrides=[ @@ -1357,11 +1362,11 @@ def build_overview(): "properties": [{"id": "unit", "value": "watt"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}], }, { - "matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Discharge ETA"}, + "matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Discharge"}, "properties": [{"id": "unit", "value": "s"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}], }, { - "matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Discharge ETA"}, + "matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Discharge"}, "properties": [{"id": "unit", "value": "s"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}], }, { @@ -1374,7 +1379,7 @@ def build_overview(): }, ], links=link_to("atlas-power"), - description="Per-UPS live snapshot: current draw, discharge ETA, and charging/discharging status.", + description="Per-UPS live snapshot: current draw, discharge, and charging/discharging status.", ) ) panels.append( @@ -1491,31 +1496,54 @@ def build_overview(): ) panels.append( - table_panel( + bargauge_panel( 44, - "One-off Job Pods >1h", - f"({ONEOFF_JOB_POD_AGE_HOURS}) > 1", + "One-off Job Pods (age hours)", + ONEOFF_JOB_POD_AGE_HOURS, {"h": 5, "w": 6, "x": 0, "y": 7}, unit="h", instant=True, - transformations=[ - {"id": "labelsToFields", "options": {}}, - {"id": "organize", "options": {"excludeByName": {"Time": True}}}, - {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}, - ], - options={"showHeader": True, "cellHeight": "sm"}, + legend="{{namespace}}/{{pod}}", + thresholds=age_thresholds, + limit=12, + decimals=2, + links=link_to("atlas-jobs"), ) ) panels.append( - stat_panel( - 45, - "Ariadne Attempts (24h)", - "sum(increase(ariadne_task_runs_total[24h]))", - {"h": 5, "w": 6, "x": 6, "y": 7}, - unit="none", - decimals=0, - links=link_to("atlas-jobs"), - ) + { + "id": 45, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": PROM_DS, + "gridPos": {"h": 5, "w": 6, "x": 6, "y": 7}, + "targets": [ + {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, + ], + "fieldConfig": { + "defaults": {"unit": "none"}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Attempts"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}} + ], + }, + { + "matcher": {"id": "byName", "options": "Failures"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}} + ], + }, + ], + }, + "options": { + "legend": {"displayMode": "table", "placement": "right"}, + "tooltip": {"mode": "multi"}, + }, + "links": link_to("atlas-jobs"), + } ) test_success = timeseries_panel( 46, @@ -1531,8 +1559,18 @@ def build_overview(): ) test_success["fieldConfig"]["defaults"]["min"] = 0 test_success["fieldConfig"]["defaults"]["max"] = 100 + test_success["fieldConfig"]["defaults"]["custom"] = { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "fillOpacity": 10, + "showPoints": "always", + "pointSize": 4, + "spanNulls": True, + } + test_success["timeFrom"] = "30d" test_success["description"] = ( - "Application-level rolling pass rate (0-100) over the last 30 days. Includes Ariadne/Metis/Ananke and auto-picks additional suite lines when platform_quality_gate_runs_total is emitted." + "Per-run interval pass points (0-100) for each software suite over the last 30 days. Points are connected to show trend; missing-run intervals are ignored." ) panels.append(test_success) test_failures = stat_panel( @@ -1546,9 +1584,7 @@ def build_overview(): thresholds=count_thresholds, links=link_to("atlas-jobs"), ) - test_failures["description"] = ( - "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query." - ) + test_failures["description"] = "Total failed test events in the last 24h across Ariadne, Metis, Ananke, and any suites publishing platform_quality_gate_runs_total." panels.append(test_failures) panels.append( @@ -3010,8 +3046,17 @@ def build_jobs_dashboard(): ) suite_panel["fieldConfig"]["defaults"]["min"] = 0 suite_panel["fieldConfig"]["defaults"]["max"] = 100 + suite_panel["fieldConfig"]["defaults"]["custom"] = { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "fillOpacity": 10, + "showPoints": "always", + "pointSize": 4, + "spanNulls": True, + } suite_panel["description"] = ( - "Application-level rolling pass percentage over the last 30 days. Existing suites: ariadne, metis, ananke; additional suites appear automatically when platform_quality_gate_runs_total is published." + "Per-run interval pass points (0-100) per suite. Existing suites: ariadne, metis, ananke; additional suites appear automatically when platform_quality_gate_runs_total is published." ) panels.append(suite_panel) @@ -3052,10 +3097,10 @@ def build_power_dashboard(): text_mode="name_and_value", targets=[ {"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Draw (W)", "instant": True}, - {"refId": "B", "expr": ANANKE_UPS_RUNTIME_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Discharge ETA", "instant": True}, + {"refId": "B", "expr": ANANKE_UPS_RUNTIME_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Discharge", "instant": True}, {"refId": "C", "expr": ANANKE_UPS_ON_BATTERY_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Status", "instant": True}, {"refId": "D", "expr": ANANKE_UPS_DRAW_WATTS_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Draw (W)", "instant": True}, - {"refId": "E", "expr": ANANKE_UPS_RUNTIME_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Discharge ETA", "instant": True}, + {"refId": "E", "expr": ANANKE_UPS_RUNTIME_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Discharge", "instant": True}, {"refId": "F", "expr": ANANKE_UPS_ON_BATTERY_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Status", "instant": True}, ], field_overrides=[ @@ -3068,11 +3113,11 @@ def build_power_dashboard(): "properties": [{"id": "unit", "value": "watt"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}], }, { - "matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Discharge ETA"}, + "matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Discharge"}, "properties": [{"id": "unit", "value": "s"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}], }, { - "matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Discharge ETA"}, + "matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Discharge"}, "properties": [{"id": "unit", "value": "s"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}], }, { diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index aded6768..ea2557da 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -1253,52 +1253,52 @@ "targets": [ { "refId": "A", - "expr": "100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d]))) / clamp_min((sum(increase(ariadne_task_runs_total[30d]))), 1)", + "expr": "(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval]))) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval]))), 1)) and on() ((sum(increase(ariadne_task_runs_total[$__interval]))) > 0)", "legendFormat": "ariadne" }, { "refId": "B", - "expr": "100 * ((sum(increase(metis_builds_total{status=\"ok\"}[30d])) + sum(increase(metis_flashes_total{status=\"ok\"}[30d])))) / clamp_min(((sum(increase(metis_builds_total[30d])) + sum(increase(metis_flashes_total[30d])))), 1)", + "expr": "(100 * ((sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) + sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])))) / clamp_min(((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval])))), 1)) and on() (((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval])))) > 0)", "legendFormat": "metis" }, { "refId": "C", - "expr": "100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[30d]))) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval]))) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval]))), 1)) and on() ((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval]))) > 0)", "legendFormat": "ananke" }, { "refId": "D", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[$__interval]))) > 0)", "legendFormat": "atlasbot" }, { "refId": "E", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[$__interval]))) > 0)", "legendFormat": "lesavka" }, { "refId": "F", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[$__interval]))) > 0)", "legendFormat": "pegasus" }, { "refId": "G", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[$__interval]))) > 0)", "legendFormat": "soteria" }, { "refId": "H", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[$__interval]))) > 0)", "legendFormat": "titan-iac" }, { "refId": "I", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[$__interval]))) > 0)", "legendFormat": "bstein-home" }, { "refId": "J", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[$__interval]))) > 0)", "legendFormat": "arcanagon" } ], @@ -1306,7 +1306,16 @@ "defaults": { "unit": "percent", "min": 0, - "max": 100 + "max": 100, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "fillOpacity": 10, + "showPoints": "always", + "pointSize": 4, + "spanNulls": true + } }, "overrides": [] }, @@ -1319,7 +1328,7 @@ "mode": "multi" } }, - "description": "Application-level rolling pass percentage over the last 30 days. Existing suites: ariadne, metis, ananke; additional suites appear automatically when platform_quality_gate_runs_total is published." + "description": "Per-run interval pass points (0-100) per suite. Existing suites: ariadne, metis, ananke; additional suites appear automatically when platform_quality_gate_runs_total is published." } ], "time": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 85e212fa..136104d9 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1094,7 +1094,7 @@ { "refId": "B", "expr": "max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Pyrphoros\"}) or on() vector(0)", - "legendFormat": "Pyrphoros Discharge ETA", + "legendFormat": "Pyrphoros Discharge", "instant": true }, { @@ -1112,7 +1112,7 @@ { "refId": "E", "expr": "max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Statera\"}) or on() vector(0)", - "legendFormat": "Statera Discharge ETA", + "legendFormat": "Statera Discharge", "instant": true }, { @@ -1183,7 +1183,7 @@ { "matcher": { "id": "byName", - "options": "Pyrphoros Discharge ETA" + "options": "Pyrphoros Discharge" }, "properties": [ { @@ -1199,7 +1199,7 @@ { "matcher": { "id": "byName", - "options": "Statera Discharge ETA" + "options": "Statera Discharge" }, "properties": [ { @@ -1290,7 +1290,7 @@ "targetBlank": true } ], - "description": "Per-UPS live snapshot: current draw, discharge ETA, and charging/discharging status." + "description": "Per-UPS live snapshot: current draw, discharge, and charging/discharging status." }, { "id": 41, @@ -1678,8 +1678,8 @@ }, { "id": 44, - "type": "table", - "title": "One-off Job Pods >1h", + "type": "bargauge", + "title": "One-off Job Pods (age hours)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1692,38 +1692,61 @@ }, "targets": [ { - "expr": "(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})) > 1", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", "instant": true } ], "fieldConfig": { "defaults": { "unit": "h", - "custom": { - "filterable": true - } + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false, - "cellHeight": "sm" + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } }, + "links": [ + { + "title": "Open atlas-jobs dashboard", + "url": "/d/atlas-jobs", + "targetBlank": true + } + ], "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true - } - } - }, { "id": "sortBy", "options": { @@ -1732,13 +1755,19 @@ ], "order": "desc" } + }, + { + "id": "limit", + "options": { + "limit": 12 + } } ] }, { "id": 45, - "type": "stat", - "title": "Ariadne Attempts (24h)", + "type": "timeseries", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1751,49 +1780,61 @@ }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[24h]))", - "refId": "A" + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "B", + "legendFormat": "Failures" } ], "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Attempts" + }, + "properties": [ { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } } ] }, - "unit": "none", - "custom": { - "displayMode": "auto" - }, - "decimals": 0 - }, - "overrides": [] + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "legend": { + "displayMode": "table", + "placement": "right" }, - "textMode": "value" + "tooltip": { + "mode": "multi" + } }, "links": [ { @@ -1820,52 +1861,52 @@ "targets": [ { "refId": "A", - "expr": "100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d]))) / clamp_min((sum(increase(ariadne_task_runs_total[30d]))), 1)", + "expr": "(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval]))) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval]))), 1)) and on() ((sum(increase(ariadne_task_runs_total[$__interval]))) > 0)", "legendFormat": "ariadne" }, { "refId": "B", - "expr": "100 * ((sum(increase(metis_builds_total{status=\"ok\"}[30d])) + sum(increase(metis_flashes_total{status=\"ok\"}[30d])))) / clamp_min(((sum(increase(metis_builds_total[30d])) + sum(increase(metis_flashes_total[30d])))), 1)", + "expr": "(100 * ((sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) + sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])))) / clamp_min(((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval])))), 1)) and on() (((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval])))) > 0)", "legendFormat": "metis" }, { "refId": "C", - "expr": "100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[30d]))) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval]))) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval]))), 1)) and on() ((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval]))) > 0)", "legendFormat": "ananke" }, { "refId": "D", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[$__interval]))) > 0)", "legendFormat": "atlasbot" }, { "refId": "E", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[$__interval]))) > 0)", "legendFormat": "lesavka" }, { "refId": "F", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[$__interval]))) > 0)", "legendFormat": "pegasus" }, { "refId": "G", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[$__interval]))) > 0)", "legendFormat": "soteria" }, { "refId": "H", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[$__interval]))) > 0)", "legendFormat": "titan-iac" }, { "refId": "I", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[$__interval]))) > 0)", "legendFormat": "bstein-home" }, { "refId": "J", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[$__interval]))) > 0)", "legendFormat": "arcanagon" } ], @@ -1873,7 +1914,16 @@ "defaults": { "unit": "percent", "min": 0, - "max": 100 + "max": 100, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "fillOpacity": 10, + "showPoints": "always", + "pointSize": 4, + "spanNulls": true + } }, "overrides": [] }, @@ -1896,7 +1946,8 @@ "targetBlank": true } ], - "description": "Application-level rolling pass rate (0-100) over the last 30 days. Includes Ariadne/Metis/Ananke and auto-picks additional suite lines when platform_quality_gate_runs_total is emitted." + "timeFrom": "30d", + "description": "Per-run interval pass points (0-100) for each software suite over the last 30 days. Points are connected to show trend; missing-run intervals are ignored." }, { "id": 47, @@ -1914,7 +1965,7 @@ }, "targets": [ { - "expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h])) or on() vector(0))", + "expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h])) or on() vector(0)) + (sum(increase(platform_quality_gate_runs_total{status!~\"ok|passed|success\"}[24h])) or on() vector(0))", "refId": "A", "instant": true } @@ -1974,7 +2025,7 @@ "targetBlank": true } ], - "description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query." + "description": "Total failed test events in the last 24h across Ariadne, Metis, Ananke, and any suites publishing platform_quality_gate_runs_total." }, { "id": 30, diff --git a/services/monitoring/dashboards/atlas-power.json b/services/monitoring/dashboards/atlas-power.json index 1595f417..c094cef3 100644 --- a/services/monitoring/dashboards/atlas-power.json +++ b/services/monitoring/dashboards/atlas-power.json @@ -28,7 +28,7 @@ { "refId": "B", "expr": "max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Pyrphoros\"}) or on() vector(0)", - "legendFormat": "Pyrphoros Discharge ETA", + "legendFormat": "Pyrphoros Discharge", "instant": true }, { @@ -46,7 +46,7 @@ { "refId": "E", "expr": "max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Statera\"}) or on() vector(0)", - "legendFormat": "Statera Discharge ETA", + "legendFormat": "Statera Discharge", "instant": true }, { @@ -117,7 +117,7 @@ { "matcher": { "id": "byName", - "options": "Pyrphoros Discharge ETA" + "options": "Pyrphoros Discharge" }, "properties": [ { @@ -133,7 +133,7 @@ { "matcher": { "id": "byName", - "options": "Statera Discharge ETA" + "options": "Statera Discharge" }, "properties": [ { diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index f1a49b3e..b8350888 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -1262,52 +1262,52 @@ data: "targets": [ { "refId": "A", - "expr": "100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d]))) / clamp_min((sum(increase(ariadne_task_runs_total[30d]))), 1)", + "expr": "(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval]))) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval]))), 1)) and on() ((sum(increase(ariadne_task_runs_total[$__interval]))) > 0)", "legendFormat": "ariadne" }, { "refId": "B", - "expr": "100 * ((sum(increase(metis_builds_total{status=\"ok\"}[30d])) + sum(increase(metis_flashes_total{status=\"ok\"}[30d])))) / clamp_min(((sum(increase(metis_builds_total[30d])) + sum(increase(metis_flashes_total[30d])))), 1)", + "expr": "(100 * ((sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) + sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])))) / clamp_min(((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval])))), 1)) and on() (((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval])))) > 0)", "legendFormat": "metis" }, { "refId": "C", - "expr": "100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[30d]))) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval]))) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval]))), 1)) and on() ((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval]))) > 0)", "legendFormat": "ananke" }, { "refId": "D", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[$__interval]))) > 0)", "legendFormat": "atlasbot" }, { "refId": "E", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[$__interval]))) > 0)", "legendFormat": "lesavka" }, { "refId": "F", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[$__interval]))) > 0)", "legendFormat": "pegasus" }, { "refId": "G", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[$__interval]))) > 0)", "legendFormat": "soteria" }, { "refId": "H", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[$__interval]))) > 0)", "legendFormat": "titan-iac" }, { "refId": "I", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[$__interval]))) > 0)", "legendFormat": "bstein-home" }, { "refId": "J", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[$__interval]))) > 0)", "legendFormat": "arcanagon" } ], @@ -1315,7 +1315,16 @@ data: "defaults": { "unit": "percent", "min": 0, - "max": 100 + "max": 100, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "fillOpacity": 10, + "showPoints": "always", + "pointSize": 4, + "spanNulls": true + } }, "overrides": [] }, @@ -1328,7 +1337,7 @@ data: "mode": "multi" } }, - "description": "Application-level rolling pass percentage over the last 30 days. Existing suites: ariadne, metis, ananke; additional suites appear automatically when platform_quality_gate_runs_total is published." + "description": "Per-run interval pass points (0-100) per suite. Existing suites: ariadne, metis, ananke; additional suites appear automatically when platform_quality_gate_runs_total is published." } ], "time": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index ac10331c..55687a54 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1103,7 +1103,7 @@ data: { "refId": "B", "expr": "max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Pyrphoros\"}) or on() vector(0)", - "legendFormat": "Pyrphoros Discharge ETA", + "legendFormat": "Pyrphoros Discharge", "instant": true }, { @@ -1121,7 +1121,7 @@ data: { "refId": "E", "expr": "max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Statera\"}) or on() vector(0)", - "legendFormat": "Statera Discharge ETA", + "legendFormat": "Statera Discharge", "instant": true }, { @@ -1192,7 +1192,7 @@ data: { "matcher": { "id": "byName", - "options": "Pyrphoros Discharge ETA" + "options": "Pyrphoros Discharge" }, "properties": [ { @@ -1208,7 +1208,7 @@ data: { "matcher": { "id": "byName", - "options": "Statera Discharge ETA" + "options": "Statera Discharge" }, "properties": [ { @@ -1299,7 +1299,7 @@ data: "targetBlank": true } ], - "description": "Per-UPS live snapshot: current draw, discharge ETA, and charging/discharging status." + "description": "Per-UPS live snapshot: current draw, discharge, and charging/discharging status." }, { "id": 41, @@ -1687,8 +1687,8 @@ data: }, { "id": 44, - "type": "table", - "title": "One-off Job Pods >1h", + "type": "bargauge", + "title": "One-off Job Pods (age hours)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1701,38 +1701,61 @@ data: }, "targets": [ { - "expr": "(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})) > 1", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", "instant": true } ], "fieldConfig": { "defaults": { "unit": "h", - "custom": { - "filterable": true - } + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false, - "cellHeight": "sm" + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } }, + "links": [ + { + "title": "Open atlas-jobs dashboard", + "url": "/d/atlas-jobs", + "targetBlank": true + } + ], "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true - } - } - }, { "id": "sortBy", "options": { @@ -1741,13 +1764,19 @@ data: ], "order": "desc" } + }, + { + "id": "limit", + "options": { + "limit": 12 + } } ] }, { "id": 45, - "type": "stat", - "title": "Ariadne Attempts (24h)", + "type": "timeseries", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1760,49 +1789,61 @@ data: }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[24h]))", - "refId": "A" + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "B", + "legendFormat": "Failures" } ], "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Attempts" + }, + "properties": [ { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } } ] }, - "unit": "none", - "custom": { - "displayMode": "auto" - }, - "decimals": 0 - }, - "overrides": [] + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "legend": { + "displayMode": "table", + "placement": "right" }, - "textMode": "value" + "tooltip": { + "mode": "multi" + } }, "links": [ { @@ -1829,52 +1870,52 @@ data: "targets": [ { "refId": "A", - "expr": "100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d]))) / clamp_min((sum(increase(ariadne_task_runs_total[30d]))), 1)", + "expr": "(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval]))) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval]))), 1)) and on() ((sum(increase(ariadne_task_runs_total[$__interval]))) > 0)", "legendFormat": "ariadne" }, { "refId": "B", - "expr": "100 * ((sum(increase(metis_builds_total{status=\"ok\"}[30d])) + sum(increase(metis_flashes_total{status=\"ok\"}[30d])))) / clamp_min(((sum(increase(metis_builds_total[30d])) + sum(increase(metis_flashes_total[30d])))), 1)", + "expr": "(100 * ((sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) + sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])))) / clamp_min(((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval])))), 1)) and on() (((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval])))) > 0)", "legendFormat": "metis" }, { "refId": "C", - "expr": "100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[30d]))) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval]))) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval]))), 1)) and on() ((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval]))) > 0)", "legendFormat": "ananke" }, { "refId": "D", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[$__interval]))) > 0)", "legendFormat": "atlasbot" }, { "refId": "E", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[$__interval]))) > 0)", "legendFormat": "lesavka" }, { "refId": "F", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[$__interval]))) > 0)", "legendFormat": "pegasus" }, { "refId": "G", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[$__interval]))) > 0)", "legendFormat": "soteria" }, { "refId": "H", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[$__interval]))) > 0)", "legendFormat": "titan-iac" }, { "refId": "I", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[$__interval]))) > 0)", "legendFormat": "bstein-home" }, { "refId": "J", - "expr": "100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[30d]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[30d]))), 1)", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[$__interval]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[$__interval]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[$__interval]))) > 0)", "legendFormat": "arcanagon" } ], @@ -1882,7 +1923,16 @@ data: "defaults": { "unit": "percent", "min": 0, - "max": 100 + "max": 100, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "fillOpacity": 10, + "showPoints": "always", + "pointSize": 4, + "spanNulls": true + } }, "overrides": [] }, @@ -1905,7 +1955,8 @@ data: "targetBlank": true } ], - "description": "Application-level rolling pass rate (0-100) over the last 30 days. Includes Ariadne/Metis/Ananke and auto-picks additional suite lines when platform_quality_gate_runs_total is emitted." + "timeFrom": "30d", + "description": "Per-run interval pass points (0-100) for each software suite over the last 30 days. Points are connected to show trend; missing-run intervals are ignored." }, { "id": 47, @@ -1923,7 +1974,7 @@ data: }, "targets": [ { - "expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h])) or on() vector(0))", + "expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h])) or on() vector(0)) + (sum(increase(platform_quality_gate_runs_total{status!~\"ok|passed|success\"}[24h])) or on() vector(0))", "refId": "A", "instant": true } @@ -1983,7 +2034,7 @@ data: "targetBlank": true } ], - "description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query." + "description": "Total failed test events in the last 24h across Ariadne, Metis, Ananke, and any suites publishing platform_quality_gate_runs_total." }, { "id": 30, diff --git a/services/monitoring/grafana-dashboard-power.yaml b/services/monitoring/grafana-dashboard-power.yaml index e2b9b172..99884d29 100644 --- a/services/monitoring/grafana-dashboard-power.yaml +++ b/services/monitoring/grafana-dashboard-power.yaml @@ -37,7 +37,7 @@ data: { "refId": "B", "expr": "max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Pyrphoros\"}) or on() vector(0)", - "legendFormat": "Pyrphoros Discharge ETA", + "legendFormat": "Pyrphoros Discharge", "instant": true }, { @@ -55,7 +55,7 @@ data: { "refId": "E", "expr": "max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Statera\"}) or on() vector(0)", - "legendFormat": "Statera Discharge ETA", + "legendFormat": "Statera Discharge", "instant": true }, { @@ -126,7 +126,7 @@ data: { "matcher": { "id": "byName", - "options": "Pyrphoros Discharge ETA" + "options": "Pyrphoros Discharge" }, "properties": [ { @@ -142,7 +142,7 @@ data: { "matcher": { "id": "byName", - "options": "Statera Discharge ETA" + "options": "Statera Discharge" }, "properties": [ {