monitoring: replace failure table with 24h suite pass snapshot

This commit is contained in:
Brad Stein 2026-04-09 20:16:44 -03:00
parent 530f440679
commit 5f4641553c
3 changed files with 106 additions and 60 deletions

View File

@ -515,6 +515,31 @@ PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS = [
{"refId": "B", "expr": PLATFORM_TEST_SUCCESS_RATE_METIS_SERIES, "legendFormat": "metis"},
{"refId": "C", "expr": PLATFORM_TEST_SUCCESS_RATE_ANANKE_SERIES, "legendFormat": "ananke"},
] + PLATFORM_TEST_GENERIC_SUITE_TARGETS
def _pass_rate_24h(ok_runs, all_runs, match_on=""):
    """Return a PromQL expression for the percentage of ``ok_runs`` in ``all_runs``.

    The denominator is clamped to a minimum of 1, and the ratio is only kept
    where ``all_runs`` is greater than zero (joined via ``and on(<match_on>)``).
    """
    guarded_ratio = f"(100 * ({ok_runs}) / clamp_min(({all_runs}), 1))"
    has_runs = f"(({all_runs}) > 0)"
    return f"{guarded_ratio} and on({match_on}) {has_runs}"


def _with_suite_label(expr, suite):
    """Wrap ``expr`` in ``label_replace`` so the result carries a fixed ``suite`` label."""
    return f'label_replace({expr}, "suite", "{suite}", "__name__", ".*")'


# 24h pass rate for the suites that export their own counters. Each entry is
# (suite label, successful-runs expression, all-runs expression); the three
# labelled ratios are combined with PromQL ``or``.
# NOTE(review): the metis expressions add two aggregated vectors with ``+``;
# in PromQL that yields no sample when either side has no data, so the metis
# bar would disappear if only builds (or only flashes) were recorded in the
# window -- confirm this is acceptable.
PLATFORM_TEST_SUCCESS_RATE_24H_NATIVE_BY_SUITE = " or ".join(
    _with_suite_label(_pass_rate_24h(ok_runs, all_runs), suite)
    for suite, ok_runs, all_runs in (
        (
            "ariadne",
            'sum(increase(ariadne_task_runs_total{status="ok"}[24h]))',
            "sum(increase(ariadne_task_runs_total[24h]))",
        ),
        (
            "metis",
            '(sum(increase(metis_builds_total{status="ok"}[24h]))'
            ' + sum(increase(metis_flashes_total{status="ok"}[24h])))',
            "(sum(increase(metis_builds_total[24h]))"
            " + sum(increase(metis_flashes_total[24h])))",
        ),
        (
            "ananke",
            'sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="ok"}[24h]))',
            'sum(increase(ananke_quality_gate_runs_total{suite="ananke"}[24h]))',
        ),
    )
)

# 24h pass rate for suites reporting through the shared quality-gate counter,
# grouped (and matched) by their existing ``suite`` label.
PLATFORM_TEST_SUCCESS_RATE_24H_GENERIC_BY_SUITE = _pass_rate_24h(
    'sum by (suite) (increase(platform_quality_gate_runs_total{status=~"ok|passed|success"}[24h]))',
    "sum by (suite) (increase(platform_quality_gate_runs_total[24h]))",
    match_on="suite",
)

# Union of both sources, highest pass rate first.
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = "sort_desc(({}) or ({}))".format(
    PLATFORM_TEST_SUCCESS_RATE_24H_NATIVE_BY_SUITE,
    PLATFORM_TEST_SUCCESS_RATE_24H_GENERIC_BY_SUITE,
)
ANANKE_SELECTOR = 'job="ananke-power"'
ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db"
@ -1583,24 +1608,29 @@ def build_overview():
)
panels.append(test_success)
panels.append(
table_panel(
bargauge_panel(
47,
"Platform Test Failures (24h)",
PLATFORM_TEST_FAILURES_24H_BY_SUITE,
"Platform Suite Pass Rate (24h)",
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE,
{"h": 5, "w": 6, "x": 18, "y": 7},
unit="none",
unit="percent",
instant=True,
transformations=[
{"id": "labelsToFields", "options": {}},
{"id": "organize", "options": {"excludeByName": {"Time": True}}},
{"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}},
legend="{{suite}}",
sort_order="desc",
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "yellow", "value": 80},
{"color": "green", "value": 95},
],
options={"showHeader": True, "cellHeight": "sm"},
footer={"show": False},
},
)
)
panels[-1]["links"] = link_to("atlas-jobs")
panels[-1]["description"] = "Failures by suite in the last 24 hours. This is a per-suite breakdown, not a single opaque total."
panels[-1]["description"] = (
"24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
)
panels.append(
stat_panel(

View File

@ -1951,8 +1951,8 @@
},
{
"id": 47,
"type": "table",
"title": "Platform Test Failures (24h)",
"type": "bargauge",
"title": "Platform Suite Pass Rate (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1965,41 +1965,49 @@
},
"targets": [
{
"expr": "sort_desc(sum by (suite) (label_replace(increase(ariadne_task_runs_total{status!=\"ok\"}[24h]), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(increase(metis_builds_total{status=\"error\"}[24h]), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(increase(metis_flashes_total{status=\"error\"}[24h]), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h]), \"suite\", \"ananke\", \"__name__\", \".*\") or increase(platform_quality_gate_runs_total{status!~\"ok|passed|success\"}[24h])))",
"expr": "sort_desc((label_replace((100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[24h]))) / clamp_min((sum(increase(ariadne_task_runs_total[24h]))), 1)) and on() ((sum(increase(ariadne_task_runs_total[24h]))) > 0), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace((100 * ((sum(increase(metis_builds_total{status=\"ok\"}[24h])) + sum(increase(metis_flashes_total{status=\"ok\"}[24h])))) / clamp_min(((sum(increase(metis_builds_total[24h])) + sum(increase(metis_flashes_total[24h])))), 1)) and on() (((sum(increase(metis_builds_total[24h])) + sum(increase(metis_flashes_total[24h])))) > 0), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace((100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[24h]))) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[24h]))), 1)) and on() ((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[24h]))) > 0), \"suite\", \"ananke\", \"__name__\", \".*\")) or ((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total[24h]))) > 0)))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"custom": {
"filterable": true
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "yellow",
"value": 80
},
{
"color": "green",
"value": 95
}
]
}
},
"overrides": []
},
"options": {
"showHeader": true,
"columnFilters": false,
"cellHeight": "sm",
"footer": {
"show": false
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true
}
}
},
{
"id": "sortBy",
"options": {
@ -2017,7 +2025,7 @@
"targetBlank": true
}
],
"description": "Failures by suite in the last 24 hours. This is a per-suite breakdown, not a single opaque total."
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
},
{
"id": 30,

View File

@ -1960,8 +1960,8 @@ data:
},
{
"id": 47,
"type": "table",
"title": "Platform Test Failures (24h)",
"type": "bargauge",
"title": "Platform Suite Pass Rate (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1974,41 +1974,49 @@ data:
},
"targets": [
{
"expr": "sort_desc(sum by (suite) (label_replace(increase(ariadne_task_runs_total{status!=\"ok\"}[24h]), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(increase(metis_builds_total{status=\"error\"}[24h]), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(increase(metis_flashes_total{status=\"error\"}[24h]), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h]), \"suite\", \"ananke\", \"__name__\", \".*\") or increase(platform_quality_gate_runs_total{status!~\"ok|passed|success\"}[24h])))",
"expr": "sort_desc((label_replace((100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[24h]))) / clamp_min((sum(increase(ariadne_task_runs_total[24h]))), 1)) and on() ((sum(increase(ariadne_task_runs_total[24h]))) > 0), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace((100 * ((sum(increase(metis_builds_total{status=\"ok\"}[24h])) + sum(increase(metis_flashes_total{status=\"ok\"}[24h])))) / clamp_min(((sum(increase(metis_builds_total[24h])) + sum(increase(metis_flashes_total[24h])))), 1)) and on() (((sum(increase(metis_builds_total[24h])) + sum(increase(metis_flashes_total[24h])))) > 0), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace((100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[24h]))) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[24h]))), 1)) and on() ((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[24h]))) > 0), \"suite\", \"ananke\", \"__name__\", \".*\")) or ((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total[24h]))) > 0)))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"custom": {
"filterable": true
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "yellow",
"value": 80
},
{
"color": "green",
"value": 95
}
]
}
},
"overrides": []
},
"options": {
"showHeader": true,
"columnFilters": false,
"cellHeight": "sm",
"footer": {
"show": false
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true
}
}
},
{
"id": "sortBy",
"options": {
@ -2026,7 +2034,7 @@ data:
"targetBlank": true
}
],
"description": "Failures by suite in the last 24 hours. This is a per-suite breakdown, not a single opaque total."
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
},
{
"id": 30,