monitoring: replace failure table with 24h suite pass snapshot

This commit is contained in:
Brad Stein 2026-04-09 20:16:44 -03:00
parent 530f440679
commit 5f4641553c
3 changed files with 106 additions and 60 deletions

View File

@ -515,6 +515,31 @@ PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS = [
{"refId": "B", "expr": PLATFORM_TEST_SUCCESS_RATE_METIS_SERIES, "legendFormat": "metis"}, {"refId": "B", "expr": PLATFORM_TEST_SUCCESS_RATE_METIS_SERIES, "legendFormat": "metis"},
{"refId": "C", "expr": PLATFORM_TEST_SUCCESS_RATE_ANANKE_SERIES, "legendFormat": "ananke"}, {"refId": "C", "expr": PLATFORM_TEST_SUCCESS_RATE_ANANKE_SERIES, "legendFormat": "ananke"},
] + PLATFORM_TEST_GENERIC_SUITE_TARGETS ] + PLATFORM_TEST_GENERIC_SUITE_TARGETS
# 24-hour pass-rate expressions, one result series per suite.
#
# Every suite uses the same PromQL shape:
#   (100 * (<ok runs>) / clamp_min((<all runs>), 1)) and on(<labels>) ((<all runs>) > 0)
# clamp_min(..., 1) keeps the division away from a zero denominator, and the
# trailing "and ... > 0" drops the sample when nothing ran in the window.
#
# NOTE(review): when a suite has runs but no matching "ok" series at all, the
# numerator vector is empty, so the suite appears to drop out of the result
# rather than report 0% - confirm whether that is intended.


def _sum_increase_24h(selector):
    """Return PromQL summing the 24h increase of every series matching `selector`."""
    return "sum(increase(%s[24h]))" % selector


def _sum_of_two(first, second):
    """Return PromQL adding two expressions, wrapped in parentheses."""
    return "(%s + %s)" % (first, second)


def _guarded_pass_rate(ok_runs, all_runs, on_labels=""):
    """Return the percentage ok/all expression, kept only where all_runs > 0."""
    return "(100 * (%(ok)s) / clamp_min((%(all)s), 1)) and on(%(on)s) ((%(all)s) > 0)" % {
        "ok": ok_runs,
        "all": all_runs,
        "on": on_labels,
    }


def _tag_suite(expr, suite):
    """Return `expr` wrapped in label_replace so the result carries suite=<suite>."""
    return 'label_replace(%s, "suite", "%s", "__name__", ".*")' % (expr, suite)


# Suites that export their own counters; each is aggregated to a single sample
# and then given an explicit suite label.
PLATFORM_TEST_SUCCESS_RATE_24H_NATIVE_BY_SUITE = " or ".join(
    [
        _tag_suite(
            _guarded_pass_rate(
                _sum_increase_24h('ariadne_task_runs_total{status="ok"}'),
                _sum_increase_24h("ariadne_task_runs_total"),
            ),
            "ariadne",
        ),
        # metis counts builds and flashes together as one suite.
        _tag_suite(
            _guarded_pass_rate(
                _sum_of_two(
                    _sum_increase_24h('metis_builds_total{status="ok"}'),
                    _sum_increase_24h('metis_flashes_total{status="ok"}'),
                ),
                _sum_of_two(
                    _sum_increase_24h("metis_builds_total"),
                    _sum_increase_24h("metis_flashes_total"),
                ),
            ),
            "metis",
        ),
        _tag_suite(
            _guarded_pass_rate(
                _sum_increase_24h('ananke_quality_gate_runs_total{suite="ananke",status="ok"}'),
                _sum_increase_24h('ananke_quality_gate_runs_total{suite="ananke"}'),
            ),
            "ananke",
        ),
    ]
)

# Suites reporting through the shared quality-gate counter; the metric already
# carries a suite label, so aggregation and the activity guard are per suite.
PLATFORM_TEST_SUCCESS_RATE_24H_GENERIC_BY_SUITE = _guarded_pass_rate(
    'sum by (suite) (increase(platform_quality_gate_runs_total{status=~"ok|passed|success"}[24h]))',
    "sum by (suite) (increase(platform_quality_gate_runs_total[24h]))",
    on_labels="suite",
)

# Union of both families, highest pass rate first.
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = "sort_desc((%s) or (%s))" % (
    PLATFORM_TEST_SUCCESS_RATE_24H_NATIVE_BY_SUITE,
    PLATFORM_TEST_SUCCESS_RATE_24H_GENERIC_BY_SUITE,
)
ANANKE_SELECTOR = 'job="ananke-power"' ANANKE_SELECTOR = 'job="ananke-power"'
ANANKE_UPS_DB_NAME = "Pyrphoros" ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db" ANANKE_UPS_DB_NODE = "titan-db"
@ -1583,24 +1608,29 @@ def build_overview():
) )
panels.append(test_success) panels.append(test_success)
panels.append( panels.append(
table_panel( bargauge_panel(
47, 47,
"Platform Test Failures (24h)", "Platform Suite Pass Rate (24h)",
PLATFORM_TEST_FAILURES_24H_BY_SUITE, PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE,
{"h": 5, "w": 6, "x": 18, "y": 7}, {"h": 5, "w": 6, "x": 18, "y": 7},
unit="none", unit="percent",
instant=True, instant=True,
transformations=[ legend="{{suite}}",
{"id": "labelsToFields", "options": {}}, sort_order="desc",
{"id": "organize", "options": {"excludeByName": {"Time": True}}}, thresholds={
{"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}, "mode": "absolute",
], "steps": [
options={"showHeader": True, "cellHeight": "sm"}, {"color": "red", "value": None},
footer={"show": False}, {"color": "yellow", "value": 80},
{"color": "green", "value": 95},
],
},
) )
) )
panels[-1]["links"] = link_to("atlas-jobs") panels[-1]["links"] = link_to("atlas-jobs")
panels[-1]["description"] = "Failures by suite in the last 24 hours. This is a per-suite breakdown, not a single opaque total." panels[-1]["description"] = (
"24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
)
panels.append( panels.append(
stat_panel( stat_panel(

View File

@ -1951,8 +1951,8 @@
}, },
{ {
"id": 47, "id": 47,
"type": "table", "type": "bargauge",
"title": "Platform Test Failures (24h)", "title": "Platform Suite Pass Rate (24h)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1965,41 +1965,49 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(sum by (suite) (label_replace(increase(ariadne_task_runs_total{status!=\"ok\"}[24h]), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(increase(metis_builds_total{status=\"error\"}[24h]), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(increase(metis_flashes_total{status=\"error\"}[24h]), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h]), \"suite\", \"ananke\", \"__name__\", \".*\") or increase(platform_quality_gate_runs_total{status!~\"ok|passed|success\"}[24h])))", "expr": "sort_desc((label_replace((100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[24h]))) / clamp_min((sum(increase(ariadne_task_runs_total[24h]))), 1)) and on() ((sum(increase(ariadne_task_runs_total[24h]))) > 0), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace((100 * ((sum(increase(metis_builds_total{status=\"ok\"}[24h])) + sum(increase(metis_flashes_total{status=\"ok\"}[24h])))) / clamp_min(((sum(increase(metis_builds_total[24h])) + sum(increase(metis_flashes_total[24h])))), 1)) and on() (((sum(increase(metis_builds_total[24h])) + sum(increase(metis_flashes_total[24h])))) > 0), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace((100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[24h]))) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[24h]))), 1)) and on() ((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[24h]))) > 0), \"suite\", \"ananke\", \"__name__\", \".*\")) or ((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total[24h]))) > 0)))",
"refId": "A", "refId": "A",
"legendFormat": "{{suite}}",
"instant": true "instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"unit": "none", "unit": "percent",
"custom": { "min": 0,
"filterable": true "max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "yellow",
"value": 80
},
{
"color": "green",
"value": 95
}
]
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"showHeader": true, "displayMode": "gradient",
"columnFilters": false, "orientation": "horizontal",
"cellHeight": "sm", "reduceOptions": {
"footer": { "calcs": [
"show": false "lastNotNull"
],
"fields": "",
"values": false
} }
}, },
"transformations": [ "transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true
}
}
},
{ {
"id": "sortBy", "id": "sortBy",
"options": { "options": {
@ -2017,7 +2025,7 @@
"targetBlank": true "targetBlank": true
} }
], ],
"description": "Failures by suite in the last 24 hours. This is a per-suite breakdown, not a single opaque total." "description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
}, },
{ {
"id": 30, "id": 30,

View File

@ -1960,8 +1960,8 @@ data:
}, },
{ {
"id": 47, "id": 47,
"type": "table", "type": "bargauge",
"title": "Platform Test Failures (24h)", "title": "Platform Suite Pass Rate (24h)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1974,41 +1974,49 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(sum by (suite) (label_replace(increase(ariadne_task_runs_total{status!=\"ok\"}[24h]), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(increase(metis_builds_total{status=\"error\"}[24h]), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(increase(metis_flashes_total{status=\"error\"}[24h]), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h]), \"suite\", \"ananke\", \"__name__\", \".*\") or increase(platform_quality_gate_runs_total{status!~\"ok|passed|success\"}[24h])))", "expr": "sort_desc((label_replace((100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[24h]))) / clamp_min((sum(increase(ariadne_task_runs_total[24h]))), 1)) and on() ((sum(increase(ariadne_task_runs_total[24h]))) > 0), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace((100 * ((sum(increase(metis_builds_total{status=\"ok\"}[24h])) + sum(increase(metis_flashes_total{status=\"ok\"}[24h])))) / clamp_min(((sum(increase(metis_builds_total[24h])) + sum(increase(metis_flashes_total[24h])))), 1)) and on() (((sum(increase(metis_builds_total[24h])) + sum(increase(metis_flashes_total[24h])))) > 0), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace((100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[24h]))) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[24h]))), 1)) and on() ((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[24h]))) > 0), \"suite\", \"ananke\", \"__name__\", \".*\")) or ((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total[24h]))) > 0)))",
"refId": "A", "refId": "A",
"legendFormat": "{{suite}}",
"instant": true "instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"unit": "none", "unit": "percent",
"custom": { "min": 0,
"filterable": true "max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "yellow",
"value": 80
},
{
"color": "green",
"value": 95
}
]
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"showHeader": true, "displayMode": "gradient",
"columnFilters": false, "orientation": "horizontal",
"cellHeight": "sm", "reduceOptions": {
"footer": { "calcs": [
"show": false "lastNotNull"
],
"fields": "",
"values": false
} }
}, },
"transformations": [ "transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true
}
}
},
{ {
"id": "sortBy", "id": "sortBy",
"options": { "options": {
@ -2026,7 +2034,7 @@ data:
"targetBlank": true "targetBlank": true
} }
], ],
"description": "Failures by suite in the last 24 hours. This is a per-suite breakdown, not a single opaque total." "description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
}, },
{ {
"id": 30, "id": 30,