monitoring: refresh testing dashboard

This commit is contained in:
Brad Stein 2026-01-21 11:29:29 -03:00
parent a9f6b04baa
commit 698b2fd96b
4 changed files with 1401 additions and 592 deletions

1
.gitignore vendored
View File

@ -6,4 +6,5 @@ __pycache__/
*.py[cod] *.py[cod]
.pytest_cache .pytest_cache
.venv .venv
.venv-ci
tmp/ tmp/

View File

@ -339,6 +339,9 @@ GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})"
ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))' ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600"
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
@ -696,8 +699,10 @@ def bargauge_panel(
grid, grid,
*, *,
unit="none", unit="none",
legend=None,
links=None, links=None,
limit=None, limit=None,
sort_order="desc",
thresholds=None, thresholds=None,
decimals=None, decimals=None,
instant=False, instant=False,
@ -710,7 +715,12 @@ def bargauge_panel(
"datasource": PROM_DS, "datasource": PROM_DS,
"gridPos": grid, "gridPos": grid,
"targets": [ "targets": [
{"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})} {
"expr": expr,
"refId": "A",
"legendFormat": legend or "{{node}}",
**({"instant": True} if instant else {}),
}
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
@ -748,7 +758,7 @@ def bargauge_panel(
panel["transformations"] = [ panel["transformations"] = [
{ {
"id": "sortBy", "id": "sortBy",
"options": {"fields": ["Value"], "order": "desc"}, "options": {"fields": ["Value"], "order": sort_order},
} }
] ]
if limit: if limit:
@ -2163,7 +2173,24 @@ def build_mail_dashboard():
def build_testing_dashboard(): def build_testing_dashboard():
panels = [] panels = []
sort_desc = [{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}] age_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 24},
{"color": "red", "value": 48},
],
}
recent_error_thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 1},
{"color": "yellow", "value": 6},
{"color": "green", "value": 24},
],
}
panels.append( panels.append(
stat_panel( stat_panel(
@ -2184,66 +2211,56 @@ def build_testing_dashboard():
) )
) )
panels.append( panels.append(
table_panel( stat_panel(
2, 2,
"Glue Jobs Missing Success", "Glue Jobs Missing Success",
GLUE_MISSING_ACTIVE, GLUE_MISSING_COUNT,
{"h": 4, "w": 6, "x": 6, "y": 0}, {"h": 4, "w": 4, "x": 4, "y": 0},
unit="none", unit="none",
transformations=sort_desc,
instant=True,
) )
) )
panels.append( panels.append(
table_panel( stat_panel(
3, 3,
"Glue Jobs Suspended", "Glue Jobs Suspended",
GLUE_SUSPENDED, GLUE_SUSPENDED_COUNT,
{"h": 4, "w": 6, "x": 12, "y": 0}, {"h": 4, "w": 4, "x": 8, "y": 0},
unit="none", unit="none",
transformations=sort_desc,
instant=True,
) )
) )
panels.append( panels.append(
table_panel( stat_panel(
4, 4,
"Glue Jobs Active Runs", "Ariadne Task Errors (1h)",
GLUE_ACTIVE, ARIADNE_TASK_ERRORS_1H_TOTAL,
{"h": 4, "w": 6, "x": 18, "y": 0}, {"h": 4, "w": 4, "x": 12, "y": 0},
unit="none", unit="none",
transformations=sort_desc,
instant=True,
) )
) )
panels.append( panels.append(
table_panel( stat_panel(
5, 5,
"Glue Jobs Last Success (hours ago)", "Ariadne Task Errors (24h)",
GLUE_LAST_SUCCESS_AGE_HOURS, ARIADNE_TASK_ERRORS_24H_TOTAL,
{"h": 8, "w": 12, "x": 0, "y": 4}, {"h": 4, "w": 4, "x": 16, "y": 0},
unit="h", unit="none",
transformations=sort_desc,
instant=True,
) )
) )
panels.append( panels.append(
table_panel( stat_panel(
6, 6,
"Glue Jobs Last Schedule (hours ago)", "Ariadne Task Runs (1h)",
GLUE_LAST_SCHEDULE_AGE_HOURS, ARIADNE_TASK_RUNS_1H_TOTAL,
{"h": 8, "w": 12, "x": 12, "y": 4}, {"h": 4, "w": 4, "x": 20, "y": 0},
unit="h", unit="none",
transformations=sort_desc,
instant=True,
) )
) )
panels.append( panels.append(
timeseries_panel( timeseries_panel(
12, 7,
"Ariadne Task Runs vs Errors (1h)", "Ariadne Task Runs vs Errors (1h)",
ARIADNE_TASK_RUNS_BY_STATUS_1H, ARIADNE_TASK_RUNS_BY_STATUS_1H,
{"h": 6, "w": 24, "x": 0, "y": 12}, {"h": 6, "w": 24, "x": 0, "y": 4},
unit="none", unit="none",
legend="{{status}}", legend="{{status}}",
legend_display="table", legend_display="table",
@ -2251,55 +2268,110 @@ def build_testing_dashboard():
) )
) )
panels.append( panels.append(
table_panel( bargauge_panel(
7, 8,
"Ariadne Task Errors (24h)", "Ariadne Task Errors (24h)",
ARIADNE_TASK_ERRORS_24H, ARIADNE_TASK_ERRORS_24H,
{"h": 6, "w": 12, "x": 0, "y": 18}, {"h": 8, "w": 12, "x": 0, "y": 10},
unit="none", unit="none",
transformations=sort_desc,
instant=True, instant=True,
legend="{{task}}",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 3},
{"color": "red", "value": 5},
],
},
) )
) )
panels.append( panels.append(
table_panel( bargauge_panel(
8,
"Ariadne Schedule Last Success (hours ago)",
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS,
{"h": 6, "w": 12, "x": 12, "y": 18},
unit="h",
transformations=sort_desc,
instant=True,
)
)
panels.append(
table_panel(
9, 9,
"Ariadne Access Requests", "Ariadne Task Success (24h)",
ARIADNE_ACCESS_REQUESTS, ARIADNE_TASK_SUCCESS_24H,
{"h": 6, "w": 12, "x": 12, "y": 24}, {"h": 8, "w": 12, "x": 12, "y": 10},
unit="none", unit="none",
transformations=sort_desc,
instant=True, instant=True,
legend="{{task}}",
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 1},
{"color": "yellow", "value": 5},
{"color": "green", "value": 10},
],
},
) )
) )
panels.append( panels.append(
table_panel( bargauge_panel(
13, 10,
"Ariadne Schedule Last Error (hours ago)", "Ariadne Schedule Last Error (hours ago)",
ARIADNE_SCHEDULE_LAST_ERROR_HOURS, ARIADNE_SCHEDULE_LAST_ERROR_HOURS,
{"h": 6, "w": 12, "x": 0, "y": 24}, {"h": 8, "w": 12, "x": 0, "y": 18},
unit="h", unit="h",
transformations=sort_desc,
instant=True, instant=True,
legend="{{task}}",
thresholds=recent_error_thresholds,
)
)
panels.append(
bargauge_panel(
11,
"Ariadne Schedule Last Success (hours ago)",
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS,
{"h": 8, "w": 12, "x": 12, "y": 18},
unit="h",
instant=True,
legend="{{task}}",
thresholds=age_thresholds,
)
)
panels.append(
bargauge_panel(
12,
"Glue Jobs Last Success (hours ago)",
GLUE_LAST_SUCCESS_AGE_HOURS,
{"h": 8, "w": 12, "x": 0, "y": 26},
unit="h",
instant=True,
legend="{{namespace}}/{{cronjob}}",
thresholds=age_thresholds,
)
)
panels.append(
bargauge_panel(
13,
"Glue Jobs Last Schedule (hours ago)",
GLUE_LAST_SCHEDULE_AGE_HOURS,
{"h": 8, "w": 12, "x": 12, "y": 26},
unit="h",
instant=True,
legend="{{namespace}}/{{cronjob}}",
thresholds=age_thresholds,
)
)
panels.append(
bargauge_panel(
14,
"Ariadne Access Requests",
ARIADNE_ACCESS_REQUESTS,
{"h": 6, "w": 8, "x": 0, "y": 34},
unit="none",
instant=True,
legend="{{status}}",
) )
) )
panels.append( panels.append(
stat_panel( stat_panel(
10, 15,
"Ariadne CI Coverage (%)", "Ariadne CI Coverage (%)",
ARIADNE_CI_COVERAGE, ARIADNE_CI_COVERAGE,
{"h": 4, "w": 6, "x": 0, "y": 30}, {"h": 6, "w": 4, "x": 8, "y": 34},
unit="percent", unit="percent",
decimals=1, decimals=1,
instant=True, instant=True,
@ -2308,12 +2380,12 @@ def build_testing_dashboard():
) )
panels.append( panels.append(
table_panel( table_panel(
11, 16,
"Ariadne CI Tests (latest)", "Ariadne CI Tests (latest)",
ARIADNE_CI_TESTS, ARIADNE_CI_TESTS,
{"h": 6, "w": 18, "x": 6, "y": 30}, {"h": 6, "w": 12, "x": 12, "y": 34},
unit="none", unit="none",
transformations=sort_desc, transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
instant=True, instant=True,
) )
) )

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff