From ab903e5619bddd9a240e2f0a9be504dc8c51796a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 4 Apr 2026 01:33:15 -0300 Subject: [PATCH] monitoring(overview): place six power/climate panels on one row and fix test/job data --- scripts/dashboards_render_atlas.py | 134 ++++++++------ .../monitoring/dashboards/atlas-jobs.json | 15 +- .../monitoring/dashboards/atlas-overview.json | 175 ++++++++---------- .../monitoring/grafana-dashboard-jobs.yaml | 15 +- .../grafana-dashboard-overview.yaml | 175 ++++++++---------- 5 files changed, 246 insertions(+), 268 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index b1b3cb22..2aa96237 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -419,17 +419,28 @@ ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = ( "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600" ) ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" -TEST_REPO_SELECTOR = 'repo=~"ariadne|metis"' -TEST_CI_COVERAGE = f'ariadne_ci_coverage_percent{{{TEST_REPO_SELECTOR}}}' -TEST_CI_TESTS = f'ariadne_ci_tests_total{{{TEST_REPO_SELECTOR}}}' -TEST_SUCCESS_RATE = ( - "100 * " - f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result="passed"}}[30d])) ' - "/ clamp_min(" - f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"passed|failed|error"}}[30d])), 1)' +PLATFORM_TEST_SUCCESS_EVENTS_30D = ( + '(sum(increase(ariadne_task_runs_total{status="ok"}[30d])) or on() vector(0)) + ' + '(sum(increase(metis_builds_total{status="ok"}[30d])) or on() vector(0)) + ' + '(sum(increase(metis_flashes_total{status="ok"}[30d])) or on() vector(0))' ) -TEST_FAILURES_24H = ( - f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))' +PLATFORM_TEST_TOTAL_EVENTS_30D = ( + "(sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + " + "(sum(increase(metis_builds_total[30d])) or on() vector(0)) + " + "(sum(increase(metis_flashes_total[30d])) or on() vector(0))" +) +TEST_SUCCESS_RATE = ( + f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_30D}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_30D}), 1)" +) +TEST_FAILURES_24H_TOTAL = ( + '(sum(increase(ariadne_task_runs_total{status!="ok"}[24h])) or on() vector(0)) + ' + '(sum(increase(metis_builds_total{status="error"}[24h])) or on() vector(0)) + ' + '(sum(increase(metis_flashes_total{status="error"}[24h])) or on() vector(0))' +) +PLATFORM_TEST_ACTIVITY_30D = ( + 'label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), "source", "ariadne", "__name__", ".*") ' + 'or label_replace(sum by (status) (increase(metis_builds_total[30d])), "source", "metis-build", "__name__", ".*") ' + 'or label_replace(sum by (status) (increase(metis_flashes_total[30d])), "source", "metis-flash", "__name__", ".*")' ) HECATE_SELECTOR = 'job="hecate-power"' HECATE_UPS_ON_BATTERY = f"sum(hecate_ups_on_battery{{{HECATE_SELECTOR}}}) or on() vector(0)" @@ -1267,7 +1278,7 @@ def build_overview(): 40, "UPS Current Load", None, - {"h": 5, "w": 8, "x": 0, "y": 11}, + {"h": 6, "w": 4, "x": 0, "y": 11}, unit="none", decimals=1, text_mode="name_and_value", @@ -1296,15 +1307,15 @@ def build_overview(): 41, "UPS History (Power Draw)", None, - {"h": 5, "w": 8, "x": 8, "y": 11}, + {"h": 6, "w": 4, "x": 4, "y": 11}, unit="watt", targets=[ {"refId": "A", "expr": HECATE_UPS_DRAW_WATTS_DB_SERIES, "legendFormat": "titan-db"}, {"refId": "B", "expr": HECATE_UPS_DRAW_WATTS_TETHYS_SERIES, "legendFormat": "tethys"}, {"refId": "C", "expr": HECATE_UPS_DRAW_WATTS_TOTAL_SERIES, "legendFormat": "combined"}, ], - legend_display="table", - legend_placement="right", + legend_display="list", + legend_placement="bottom", links=link_to("atlas-power"), ) ) @@ -1313,7 +1324,7 @@ def build_overview(): 42, "Current Climate", None, - {"h": 5, "w": 8, "x": 16, "y": 11}, + {"h": 6, "w": 4, "x": 8, "y": 11}, unit="none", decimals=2, text_mode="name_and_value", @@ -1334,7 +1345,7 @@ def build_overview(): 43, "Climate History", None, - {"h": 5, "w": 8, "x": 0, "y": 16}, + {"h": 6, "w": 4, "x": 12, "y": 11}, unit="celsius", targets=[ {"refId": "A", "expr": CLIMATE_TEMP_SERIES, "legendFormat": "Temperature (°C)"}, @@ -1351,8 +1362,8 @@ def build_overview(): ], } ], - legend_display="table", - legend_placement="right", + legend_display="list", + legend_placement="bottom", links=link_to("atlas-power"), ) ) @@ -1361,7 +1372,7 @@ def build_overview(): 140, "Fan Activity", None, - {"h": 5, "w": 8, "x": 8, "y": 16}, + {"h": 6, "w": 4, "x": 16, "y": 11}, unit="none", decimals=1, text_mode="name_and_value", @@ -1387,7 +1398,7 @@ def build_overview(): 141, "Fan History (0-10)", None, - {"h": 5, "w": 8, "x": 16, "y": 16}, + {"h": 6, "w": 4, "x": 20, "y": 11}, unit="none", max_value=10, targets=[ @@ -1396,22 +1407,26 @@ def build_overview(): {"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Outside Inlet"}, {"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior"}, ], - legend_display="table", - legend_placement="right", + legend_display="list", + legend_placement="bottom", links=link_to("atlas-power"), ) ) panels.append( - stat_panel( + table_panel( 44, "One-off Job Pods >1h", - f"sum(({ONEOFF_JOB_POD_AGE_HOURS}) > bool 1) or on() vector(0)", - {"h": 3, "w": 6, "x": 0, "y": 21}, - unit="none", + f"({ONEOFF_JOB_POD_AGE_HOURS}) > 1", + {"h": 3, "w": 6, "x": 0, "y": 17}, + unit="h", instant=True, - thresholds=count_thresholds, - links=link_to("atlas-jobs"), + transformations=[ + {"id": "labelsToFields", "options": {}}, + {"id": "organize", "options": {"excludeByName": {"Time": True}}}, + {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}, + ], + options={"showHeader": True, "cellHeight": "sm"}, ) ) panels.append( @@ -1419,7 +1434,7 @@ def build_overview(): 45, "Ariadne Attempts (24h)", "sum(increase(ariadne_task_runs_total[24h]))", - {"h": 3, "w": 6, "x": 6, "y": 21}, + {"h": 3, "w": 6, "x": 6, "y": 17}, unit="none", decimals=0, links=link_to("atlas-jobs"), @@ -1429,7 +1444,7 @@ def build_overview(): 46, "Platform Test Success Rate", TEST_SUCCESS_RATE, - {"h": 3, "w": 6, "x": 12, "y": 21}, + {"h": 3, "w": 6, "x": 12, "y": 17}, unit="percent", decimals=2, thresholds={ @@ -1451,8 +1466,8 @@ def build_overview(): test_failures = stat_panel( 47, "Platform Test Failures (24h)", - "sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h]))", - {"h": 3, "w": 6, "x": 18, "y": 21}, + TEST_FAILURES_24H_TOTAL, + {"h": 3, "w": 6, "x": 18, "y": 17}, unit="none", decimals=0, instant=True, @@ -1469,7 +1484,7 @@ def build_overview(): 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', - {"h": 3, "w": 4, "x": 0, "y": 24}, + {"h": 3, "w": 4, "x": 0, "y": 20}, unit="none", links=link_to("atlas-mail"), ) @@ -1480,7 +1495,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 3, "w": 4, "x": 8, "y": 24}, + "gridPos": {"h": 3, "w": 4, "x": 8, "y": 20}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1526,7 +1541,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 3, "w": 4, "x": 4, "y": 24}, + {"h": 3, "w": 4, "x": 4, "y": 20}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1538,7 +1553,7 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 3, "w": 4, "x": 12, "y": 24}, + {"h": 3, "w": 4, "x": 12, "y": 20}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, @@ -1550,7 +1565,7 @@ def build_overview(): 34, "Postgres Connections Used", POSTGRES_CONN_USED, - {"h": 3, "w": 4, "x": 16, "y": 24}, + {"h": 3, "w": 4, "x": 16, "y": 20}, decimals=0, text_mode="name_and_value", legend="{{conn}}", @@ -1562,7 +1577,7 @@ def build_overview(): 35, "Postgres Hottest Connections", POSTGRES_CONN_HOTTEST, - {"h": 3, "w": 4, "x": 20, "y": 24}, + {"h": 3, "w": 4, "x": 20, "y": 20}, unit="none", decimals=0, text_mode="name_and_value", @@ -1580,7 +1595,7 @@ def build_overview(): 11, "Namespace CPU Share", namespace_cpu_share_expr(cpu_scope), - {"h": 9, "w": 8, "x": 0, "y": 27}, + {"h": 9, "w": 8, "x": 0, "y": 23}, links=namespace_scope_links("namespace_scope_cpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1590,7 +1605,7 @@ def build_overview(): 12, "Namespace GPU Share", namespace_gpu_share_expr(gpu_scope), - {"h": 9, "w": 8, "x": 8, "y": 27}, + {"h": 9, "w": 8, "x": 8, "y": 23}, links=namespace_scope_links("namespace_scope_gpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1600,7 +1615,7 @@ def build_overview(): 13, "Namespace RAM Share", namespace_ram_share_expr(ram_scope), - {"h": 9, "w": 8, "x": 16, "y": 27}, + {"h": 9, "w": 8, "x": 16, "y": 23}, links=namespace_scope_links("namespace_scope_ram"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1612,7 +1627,7 @@ def build_overview(): 14, "Worker Node CPU", node_cpu_expr(worker_filter), - {"h": 12, "w": 12, "x": 0, "y": 43}, + {"h": 12, "w": 12, "x": 0, "y": 39}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1626,7 +1641,7 @@ def build_overview(): 15, "Worker Node RAM", node_mem_expr(worker_filter), - {"h": 12, "w": 12, "x": 12, "y": 43}, + {"h": 12, "w": 12, "x": 12, "y": 39}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1641,7 +1656,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 0, "y": 55}, + {"h": 10, "w": 12, "x": 0, "y": 51}, unit="percent", legend="{{node}}", legend_display="table", @@ -1653,7 +1668,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 12, "y": 55}, + {"h": 10, "w": 12, "x": 12, "y": 51}, unit="percent", legend="{{node}}", legend_display="table", @@ -1666,7 +1681,7 @@ def build_overview(): 28, "Node Pod Share", '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', - {"h": 10, "w": 12, "x": 0, "y": 65}, + {"h": 10, "w": 12, "x": 0, "y": 61}, ) ) panels.append( @@ -1674,7 +1689,7 @@ def build_overview(): 29, "Top Nodes by Pod Count", 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', - {"h": 10, "w": 12, "x": 12, "y": 65}, + {"h": 10, "w": 12, "x": 12, "y": 61}, unit="none", limit=12, decimals=0, @@ -1696,7 +1711,7 @@ def build_overview(): 18, "Cluster Ingress Throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 8, "x": 0, "y": 36}, + {"h": 7, "w": 8, "x": 0, "y": 32}, unit="Bps", legend="Ingress (Traefik)", legend_display="list", @@ -1709,7 +1724,7 @@ def build_overview(): 19, "Cluster Egress Throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 8, "x": 8, "y": 36}, + {"h": 7, "w": 8, "x": 8, "y": 32}, unit="Bps", legend="Egress (Traefik)", legend_display="list", @@ -1722,7 +1737,7 @@ def build_overview(): 20, "Intra-Cluster Throughput", NET_INTERNAL_EXPR, - {"h": 7, "w": 8, "x": 16, "y": 36}, + {"h": 7, "w": 8, "x": 16, "y": 32}, unit="Bps", legend="Internal traffic", legend_display="list", @@ -1736,7 +1751,7 @@ def build_overview(): 21, "Root Filesystem Usage", root_usage_expr(), - {"h": 16, "w": 12, "x": 0, "y": 75}, + {"h": 16, "w": 12, "x": 0, "y": 71}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1751,7 +1766,7 @@ def build_overview(): 22, "Nodes Closest to Full Root Disks", f"topk(12, {root_usage_expr()})", - {"h": 16, "w": 12, "x": 12, "y": 75}, + {"h": 16, "w": 12, "x": 12, "y": 71}, unit="percent", thresholds=PERCENT_THRESHOLDS, links=link_to("atlas-storage"), @@ -2889,27 +2904,26 @@ def build_jobs_dashboard(): ) coverage_panel = stat_panel( 17, - "Platform CI Coverage (%)", - TEST_CI_COVERAGE, + "Platform Test Success Rate (30d)", + TEST_SUCCESS_RATE, {"h": 6, "w": 4, "x": 8, "y": 11}, unit="percent", - decimals=1, + decimals=2, instant=True, - legend="{{branch}}", ) - coverage_panel["description"] = "Internal source panel for Atlas Overview automation test rollups." + coverage_panel["description"] = "Internal rollup across Ariadne task runs and Metis build/flash outcomes." panels.append(coverage_panel) tests_panel = table_panel( 18, - "Platform CI Tests (Ariadne + Metis)", - TEST_CI_TESTS, + "Platform Test Activity (30d)", + PLATFORM_TEST_ACTIVITY_30D, {"h": 6, "w": 12, "x": 12, "y": 11}, unit="none", transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], instant=True, ) tests_panel["description"] = ( - "Atlas Overview test panels depend on these internal repo-tagged CI series from Ariadne and Metis." + "Atlas Overview test panels depend on this internal activity table sourced from Ariadne and Metis counters." ) panels.append(tests_panel) diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index b0a7307a..7799aec4 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -1125,7 +1125,7 @@ { "id": 17, "type": "stat", - "title": "Platform CI Coverage (%)", + "title": "Platform Test Success Rate (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1138,9 +1138,8 @@ }, "targets": [ { - "expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}", + "expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0))), 1)", "refId": "A", - "legendFormat": "{{branch}}", "instant": true } ], @@ -1167,7 +1166,7 @@ "custom": { "displayMode": "auto" }, - "decimals": 1 + "decimals": 2 }, "overrides": [] }, @@ -1184,12 +1183,12 @@ }, "textMode": "value" }, - "description": "Internal source panel for Atlas Overview automation test rollups." + "description": "Internal rollup across Ariadne task runs and Metis build/flash outcomes." }, { "id": 18, "type": "table", - "title": "Platform CI Tests (Ariadne + Metis)", + "title": "Platform Test Activity (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1202,7 +1201,7 @@ }, "targets": [ { - "expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}", + "expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\")", "refId": "A", "instant": true } @@ -1235,7 +1234,7 @@ } } ], - "description": "Atlas Overview test panels depend on these internal repo-tagged CI series from Ariadne and Metis." + "description": "Atlas Overview test panels depend on this internal activity table sourced from Ariadne and Metis counters." } ], "time": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 0fdc5fa3..b70dacae 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1079,8 +1079,8 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 8, + "h": 6, + "w": 4, "x": 0, "y": 11 }, @@ -1277,9 +1277,9 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 8, - "x": 8, + "h": 6, + "w": 4, + "x": 4, "y": 11 }, "targets": [ @@ -1307,8 +1307,8 @@ }, "options": { "legend": { - "displayMode": "table", - "placement": "right" + "displayMode": "list", + "placement": "bottom" }, "tooltip": { "mode": "multi" @@ -1331,9 +1331,9 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 8, - "x": 16, + "h": 6, + "w": 4, + "x": 8, "y": 11 }, "targets": [ @@ -1433,10 +1433,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 8, - "x": 0, - "y": 16 + "h": 6, + "w": 4, + "x": 12, + "y": 11 }, "targets": [ { @@ -1483,8 +1483,8 @@ }, "options": { "legend": { - "displayMode": "table", - "placement": "right" + "displayMode": "list", + "placement": "bottom" }, "tooltip": { "mode": "multi" @@ -1507,10 +1507,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 8, - "x": 8, - "y": 16 + "h": 6, + "w": 4, + "x": 16, + "y": 11 }, "targets": [ { @@ -1599,10 +1599,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 8, - "x": 16, - "y": 16 + "h": 6, + "w": 4, + "x": 20, + "y": 11 }, "targets": [ { @@ -1635,8 +1635,8 @@ }, "options": { "legend": { - "displayMode": "table", - "placement": "right" + "displayMode": "list", + "placement": "bottom" }, "tooltip": { "mode": "multi" @@ -1652,7 +1652,7 @@ }, { "id": 44, - "type": "stat", + "type": "table", "title": "One-off Job Pods >1h", "datasource": { "type": "prometheus", @@ -1662,67 +1662,50 @@ "h": 3, "w": 6, "x": 0, - "y": 21 + "y": 17 }, "targets": [ { - "expr": "sum((((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})) > bool 1) or on() vector(0)", + "expr": "(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})) > 1", "refId": "A", "instant": true } ], "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", + "unit": "h", "custom": { - "displayMode": "auto" + "filterable": true } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" + "showHeader": true, + "columnFilters": false, + "cellHeight": "sm" }, - "links": [ + "transformations": [ { - "title": "Open atlas-jobs dashboard", - "url": "/d/atlas-jobs", - "targetBlank": true + "id": "labelsToFields", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + } + } + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } } ] }, @@ -1738,7 +1721,7 @@ "h": 3, "w": 6, "x": 6, - "y": 21 + "y": 17 }, "targets": [ { @@ -1806,11 +1789,11 @@ "h": 3, "w": 6, "x": 12, - "y": 21 + "y": 17 }, "targets": [ { - "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)", + "expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0))), 1)", "refId": "A" } ], @@ -1883,11 +1866,11 @@ "h": 3, "w": 6, "x": 18, - "y": 21 + "y": 17 }, "targets": [ { - "expr": "sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h]))", + "expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0))", "refId": "A", "instant": true } @@ -1961,7 +1944,7 @@ "h": 3, "w": 4, "x": 0, - "y": 24 + "y": 20 }, "targets": [ { @@ -2028,7 +2011,7 @@ "h": 3, "w": 4, "x": 8, - "y": 24 + "y": 20 }, "targets": [ { @@ -2133,7 +2116,7 @@ "h": 3, "w": 4, "x": 4, - "y": 24 + "y": 20 }, "targets": [ { @@ -2209,7 +2192,7 @@ "h": 3, "w": 4, "x": 12, - "y": 24 + "y": 20 }, "targets": [ { @@ -2285,7 +2268,7 @@ "h": 3, "w": 4, "x": 16, - "y": 24 + "y": 20 }, "targets": [ { @@ -2348,7 +2331,7 @@ "h": 3, "w": 4, "x": 20, - "y": 24 + "y": 20 }, "targets": [ { @@ -2411,7 +2394,7 @@ "h": 9, "w": 8, "x": 0, - "y": 27 + "y": 23 }, "targets": [ { @@ -2480,7 +2463,7 @@ "h": 9, "w": 8, "x": 8, - "y": 27 + "y": 23 }, "targets": [ { @@ -2549,7 +2532,7 @@ "h": 9, "w": 8, "x": 16, - "y": 27 + "y": 23 }, "targets": [ { @@ -2618,7 +2601,7 @@ "h": 12, "w": 12, "x": 0, - "y": 43 + "y": 39 }, "targets": [ { @@ -2665,7 +2648,7 @@ "h": 12, "w": 12, "x": 12, - "y": 43 + "y": 39 }, "targets": [ { @@ -2712,7 +2695,7 @@ "h": 10, "w": 12, "x": 0, - "y": 55 + "y": 51 }, "targets": [ { @@ -2749,7 +2732,7 @@ "h": 10, "w": 12, "x": 12, - "y": 55 + "y": 51 }, "targets": [ { @@ -2786,7 +2769,7 @@ "h": 10, "w": 12, "x": 0, - "y": 65 + "y": 61 }, "targets": [ { @@ -2837,7 +2820,7 @@ "h": 10, "w": 12, "x": 12, - "y": 65 + "y": 61 }, "targets": [ { @@ -2918,7 +2901,7 @@ "h": 7, "w": 8, "x": 0, - "y": 36 + "y": 32 }, "targets": [ { @@ -2962,7 +2945,7 @@ "h": 7, "w": 8, "x": 8, - "y": 36 + "y": 32 }, "targets": [ { @@ -3006,7 +2989,7 @@ "h": 7, "w": 8, "x": 16, - "y": 36 + "y": 32 }, "targets": [ { @@ -3050,7 +3033,7 @@ "h": 16, "w": 12, "x": 0, - "y": 75 + "y": 71 }, "targets": [ { @@ -3098,7 +3081,7 @@ "h": 16, "w": 12, "x": 12, - "y": 75 + "y": 71 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 9f37d00c..1dc455d4 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -1134,7 +1134,7 @@ data: { "id": 17, "type": "stat", - "title": "Platform CI Coverage (%)", + "title": "Platform Test Success Rate (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1147,9 +1147,8 @@ data: }, "targets": [ { - "expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}", + "expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0))), 1)", "refId": "A", - "legendFormat": "{{branch}}", "instant": true } ], @@ -1176,7 +1175,7 @@ data: "custom": { "displayMode": "auto" }, - "decimals": 1 + "decimals": 2 }, "overrides": [] }, @@ -1193,12 +1192,12 @@ data: }, "textMode": "value" }, - "description": "Internal source panel for Atlas Overview automation test rollups." + "description": "Internal rollup across Ariadne task runs and Metis build/flash outcomes." }, { "id": 18, "type": "table", - "title": "Platform CI Tests (Ariadne + Metis)", + "title": "Platform Test Activity (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1211,7 +1210,7 @@ data: }, "targets": [ { - "expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}", + "expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\")", "refId": "A", "instant": true } @@ -1244,7 +1243,7 @@ data: } } ], - "description": "Atlas Overview test panels depend on these internal repo-tagged CI series from Ariadne and Metis." + "description": "Atlas Overview test panels depend on this internal activity table sourced from Ariadne and Metis counters." } ], "time": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 5609a6ee..683cf20e 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1088,8 +1088,8 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 8, + "h": 6, + "w": 4, "x": 0, "y": 11 }, @@ -1286,9 +1286,9 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 8, - "x": 8, + "h": 6, + "w": 4, + "x": 4, "y": 11 }, "targets": [ @@ -1316,8 +1316,8 @@ data: }, "options": { "legend": { - "displayMode": "table", - "placement": "right" + "displayMode": "list", + "placement": "bottom" }, "tooltip": { "mode": "multi" @@ -1340,9 +1340,9 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 8, - "x": 16, + "h": 6, + "w": 4, + "x": 8, "y": 11 }, "targets": [ @@ -1442,10 +1442,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 8, - "x": 0, - "y": 16 + "h": 6, + "w": 4, + "x": 12, + "y": 11 }, "targets": [ { @@ -1492,8 +1492,8 @@ data: }, "options": { "legend": { - "displayMode": "table", - "placement": "right" + "displayMode": "list", + "placement": "bottom" }, "tooltip": { "mode": "multi" @@ -1516,10 +1516,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 8, - "x": 8, - "y": 16 + "h": 6, + "w": 4, + "x": 16, + "y": 11 }, "targets": [ { @@ -1608,10 +1608,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 8, - "x": 16, - "y": 16 + "h": 6, + "w": 4, + "x": 20, + "y": 11 }, "targets": [ { @@ -1644,8 +1644,8 @@ data: }, "options": { "legend": { - "displayMode": "table", - "placement": "right" + "displayMode": "list", + "placement": "bottom" }, "tooltip": { "mode": "multi" @@ -1661,7 +1661,7 @@ data: }, { "id": 44, - "type": "stat", + "type": "table", "title": "One-off Job Pods >1h", "datasource": { "type": "prometheus", @@ -1671,67 +1671,50 @@ data: "h": 3, "w": 6, "x": 0, - "y": 21 + "y": 17 }, "targets": [ { - "expr": "sum((((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})) > bool 1) or on() vector(0)", + "expr": "(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})) > 1", "refId": "A", "instant": true } ], "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", + "unit": "h", "custom": { - "displayMode": "auto" + "filterable": true } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" + "showHeader": true, + "columnFilters": false, + "cellHeight": "sm" }, - "links": [ + "transformations": [ { - "title": "Open atlas-jobs dashboard", - "url": "/d/atlas-jobs", - "targetBlank": true + "id": "labelsToFields", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + } + } + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } } ] }, @@ -1747,7 +1730,7 @@ data: "h": 3, "w": 6, "x": 6, - "y": 21 + "y": 17 }, "targets": [ { @@ -1815,11 +1798,11 @@ data: "h": 3, "w": 6, "x": 12, - "y": 21 + "y": 17 }, "targets": [ { - "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)", + "expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0))), 1)", "refId": "A" } ], @@ -1892,11 +1875,11 @@ data: "h": 3, "w": 6, "x": 18, - "y": 21 + "y": 17 }, "targets": [ { - "expr": "sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h]))", + "expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0))", "refId": "A", "instant": true } @@ -1970,7 +1953,7 @@ data: "h": 3, "w": 4, "x": 0, - "y": 24 + "y": 20 }, "targets": [ { @@ -2037,7 +2020,7 @@ data: "h": 3, "w": 4, "x": 8, - "y": 24 + "y": 20 }, "targets": [ { @@ -2142,7 +2125,7 @@ data: "h": 3, "w": 4, "x": 4, - "y": 24 + "y": 20 }, "targets": [ { @@ -2218,7 +2201,7 @@ data: "h": 3, "w": 4, "x": 12, - "y": 24 + "y": 20 }, "targets": [ { @@ -2294,7 +2277,7 @@ data: "h": 3, "w": 4, "x": 16, - "y": 24 + "y": 20 }, "targets": [ { @@ -2357,7 +2340,7 @@ data: "h": 3, "w": 4, "x": 20, - "y": 24 + "y": 20 }, "targets": [ { @@ -2420,7 +2403,7 @@ data: "h": 9, "w": 8, "x": 0, - "y": 27 + "y": 23 }, "targets": [ { @@ -2489,7 +2472,7 @@ data: "h": 9, "w": 8, "x": 8, - "y": 27 + "y": 23 }, "targets": [ { @@ -2558,7 +2541,7 @@ data: "h": 9, "w": 8, "x": 16, - "y": 27 + "y": 23 }, "targets": [ { @@ -2627,7 +2610,7 @@ data: "h": 12, "w": 12, "x": 0, - "y": 43 + "y": 39 }, "targets": [ { @@ -2674,7 +2657,7 @@ data: "h": 12, "w": 12, "x": 12, - "y": 43 + "y": 39 }, "targets": [ { @@ -2721,7 +2704,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 55 + "y": 51 }, "targets": [ { @@ -2758,7 +2741,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 55 + "y": 51 }, "targets": [ { @@ -2795,7 +2778,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 65 + "y": 61 }, "targets": [ { @@ -2846,7 +2829,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 65 + "y": 61 }, "targets": [ { @@ -2927,7 +2910,7 @@ data: "h": 7, "w": 8, "x": 0, - "y": 36 + "y": 32 }, "targets": [ { @@ -2971,7 +2954,7 @@ data: "h": 7, "w": 8, "x": 8, - "y": 36 + "y": 32 }, "targets": [ { @@ -3015,7 +2998,7 @@ data: "h": 7, "w": 8, "x": 16, - "y": 36 + "y": 32 }, "targets": [ { @@ -3059,7 +3042,7 @@ data: "h": 16, "w": 12, "x": 0, - "y": 75 + "y": 71 }, "targets": [ { @@ -3107,7 +3090,7 @@ data: "h": 16, "w": 12, "x": 12, - "y": 75 + "y": 71 }, "targets": [ {