monitoring: refresh jobs dashboards
This commit is contained in:
parent
2e407e1962
commit
8b35ab0292
@ -337,16 +337,39 @@ GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))"
|
||||
GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})"
|
||||
GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})"
|
||||
ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
|
||||
ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
|
||||
ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
|
||||
ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
|
||||
ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
|
||||
ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
|
||||
ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
|
||||
ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
|
||||
ARIADNE_TASK_ATTEMPTS_1H = 'sum(increase(ariadne_task_runs_total[1h]))'
|
||||
ARIADNE_TASK_FAILURES_1H = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
|
||||
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
|
||||
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600"
|
||||
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
|
||||
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
|
||||
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
|
||||
ARIADNE_TEST_SUCCESS_RATE = (
|
||||
"100 * "
|
||||
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[1h])) '
|
||||
"/ clamp_min("
|
||||
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[1h])), 1)'
|
||||
)
|
||||
ARIADNE_TEST_FAILURES_24H = (
|
||||
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
|
||||
)
|
||||
ONEOFF_JOB_OWNER = (
|
||||
'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")'
|
||||
)
|
||||
ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})'
|
||||
ONEOFF_JOB_POD_AGE_HOURS = (
|
||||
'((time() - kube_pod_start_time{pod!=""}) / 3600) '
|
||||
f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} '
|
||||
'* on(namespace,pod) group_left(phase) '
|
||||
'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})'
|
||||
)
|
||||
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
|
||||
GPU_NODE_REGEX = "|".join(GPU_NODES)
|
||||
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
||||
@ -798,6 +821,15 @@ def build_overview():
|
||||
{"color": "red", "value": 3},
|
||||
],
|
||||
}
|
||||
age_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 6},
|
||||
{"color": "orange", "value": 24},
|
||||
{"color": "red", "value": 48},
|
||||
],
|
||||
}
|
||||
|
||||
row1_stats = [
|
||||
{
|
||||
@ -1000,7 +1032,7 @@ def build_overview():
|
||||
30,
|
||||
"Mail Sent (1d)",
|
||||
'max(postmark_outbound_sent{window="1d"})',
|
||||
{"h": 2, "w": 5, "x": 0, "y": 8},
|
||||
{"h": 3, "w": 5, "x": 0, "y": 8},
|
||||
unit="none",
|
||||
links=link_to("atlas-mail"),
|
||||
)
|
||||
@ -1011,7 +1043,7 @@ def build_overview():
|
||||
"type": "stat",
|
||||
"title": "Mail Bounces (1d)",
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": {"h": 2, "w": 5, "x": 10, "y": 8},
|
||||
"gridPos": {"h": 3, "w": 5, "x": 10, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
|
||||
@ -1057,7 +1089,7 @@ def build_overview():
|
||||
32,
|
||||
"Mail Success Rate (1d)",
|
||||
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
|
||||
{"h": 2, "w": 5, "x": 5, "y": 8},
|
||||
{"h": 3, "w": 5, "x": 5, "y": 8},
|
||||
unit="percent",
|
||||
thresholds=mail_success_thresholds,
|
||||
decimals=1,
|
||||
@ -1069,7 +1101,7 @@ def build_overview():
|
||||
33,
|
||||
"Mail Limit Used (30d)",
|
||||
"max(postmark_sending_limit_used_percent)",
|
||||
{"h": 2, "w": 5, "x": 15, "y": 8},
|
||||
{"h": 3, "w": 5, "x": 15, "y": 8},
|
||||
unit="percent",
|
||||
thresholds=mail_limit_thresholds,
|
||||
decimals=1,
|
||||
@ -1089,13 +1121,76 @@ def build_overview():
|
||||
panel_id,
|
||||
title,
|
||||
expr,
|
||||
{"h": 6, "w": 6, "x": 6 * idx, "y": 10},
|
||||
{"h": 5, "w": 6, "x": 6 * idx, "y": 11},
|
||||
unit=unit,
|
||||
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
|
||||
links=link_to("atlas-storage"),
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(
|
||||
bargauge_panel(
|
||||
40,
|
||||
"One-off Job Pods (age hours)",
|
||||
ONEOFF_JOB_POD_AGE_HOURS,
|
||||
{"h": 6, "w": 4, "x": 0, "y": 16},
|
||||
unit="h",
|
||||
instant=True,
|
||||
legend="{{namespace}}/{{pod}}",
|
||||
thresholds=age_thresholds,
|
||||
limit=8,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
{
|
||||
"id": 41,
|
||||
"type": "timeseries",
|
||||
"title": "Ariadne Attempts vs Failures (1h)",
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": {"h": 6, "w": 8, "x": 4, "y": 16},
|
||||
"targets": [
|
||||
{"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"},
|
||||
{"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"},
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
|
||||
"options": {
|
||||
"legend": {"displayMode": "table", "placement": "right"},
|
||||
"tooltip": {"mode": "multi"},
|
||||
},
|
||||
}
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
42,
|
||||
"Ariadne Test Success Rate",
|
||||
ARIADNE_TEST_SUCCESS_RATE,
|
||||
{"h": 6, "w": 8, "x": 12, "y": 16},
|
||||
unit="percent",
|
||||
legend=None,
|
||||
legend_display="list",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
bargauge_panel(
|
||||
43,
|
||||
"Tests with Failures (24h)",
|
||||
ARIADNE_TEST_FAILURES_24H,
|
||||
{"h": 6, "w": 4, "x": 20, "y": 16},
|
||||
unit="none",
|
||||
instant=True,
|
||||
legend="{{result}}",
|
||||
thresholds={
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 1},
|
||||
{"color": "orange", "value": 5},
|
||||
{"color": "red", "value": 10},
|
||||
],
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
cpu_scope = "$namespace_scope_cpu"
|
||||
gpu_scope = "$namespace_scope_gpu"
|
||||
ram_scope = "$namespace_scope_ram"
|
||||
@ -1105,7 +1200,7 @@ def build_overview():
|
||||
11,
|
||||
"Namespace CPU Share",
|
||||
namespace_cpu_share_expr(cpu_scope),
|
||||
{"h": 9, "w": 8, "x": 0, "y": 16},
|
||||
{"h": 9, "w": 8, "x": 0, "y": 22},
|
||||
links=namespace_scope_links("namespace_scope_cpu"),
|
||||
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
|
||||
)
|
||||
@ -1115,7 +1210,7 @@ def build_overview():
|
||||
12,
|
||||
"Namespace GPU Share",
|
||||
namespace_gpu_share_expr(gpu_scope),
|
||||
{"h": 9, "w": 8, "x": 8, "y": 16},
|
||||
{"h": 9, "w": 8, "x": 8, "y": 22},
|
||||
links=namespace_scope_links("namespace_scope_gpu"),
|
||||
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
|
||||
)
|
||||
@ -1125,7 +1220,7 @@ def build_overview():
|
||||
13,
|
||||
"Namespace RAM Share",
|
||||
namespace_ram_share_expr(ram_scope),
|
||||
{"h": 9, "w": 8, "x": 16, "y": 16},
|
||||
{"h": 9, "w": 8, "x": 16, "y": 22},
|
||||
links=namespace_scope_links("namespace_scope_ram"),
|
||||
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
|
||||
)
|
||||
@ -1137,7 +1232,7 @@ def build_overview():
|
||||
14,
|
||||
"Worker Node CPU",
|
||||
node_cpu_expr(worker_filter),
|
||||
{"h": 12, "w": 12, "x": 0, "y": 32},
|
||||
{"h": 12, "w": 12, "x": 0, "y": 38},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
legend_calcs=["last"],
|
||||
@ -1151,7 +1246,7 @@ def build_overview():
|
||||
15,
|
||||
"Worker Node RAM",
|
||||
node_mem_expr(worker_filter),
|
||||
{"h": 12, "w": 12, "x": 12, "y": 32},
|
||||
{"h": 12, "w": 12, "x": 12, "y": 38},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
legend_calcs=["last"],
|
||||
@ -1166,7 +1261,7 @@ def build_overview():
|
||||
16,
|
||||
"Control plane CPU",
|
||||
node_cpu_expr(CONTROL_ALL_REGEX),
|
||||
{"h": 10, "w": 12, "x": 0, "y": 44},
|
||||
{"h": 10, "w": 12, "x": 0, "y": 50},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
legend_display="table",
|
||||
@ -1178,7 +1273,7 @@ def build_overview():
|
||||
17,
|
||||
"Control plane RAM",
|
||||
node_mem_expr(CONTROL_ALL_REGEX),
|
||||
{"h": 10, "w": 12, "x": 12, "y": 44},
|
||||
{"h": 10, "w": 12, "x": 12, "y": 50},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
legend_display="table",
|
||||
@ -1191,7 +1286,7 @@ def build_overview():
|
||||
28,
|
||||
"Node Pod Share",
|
||||
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
|
||||
{"h": 10, "w": 12, "x": 0, "y": 54},
|
||||
{"h": 10, "w": 12, "x": 0, "y": 60},
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
@ -1199,7 +1294,7 @@ def build_overview():
|
||||
29,
|
||||
"Top Nodes by Pod Count",
|
||||
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
|
||||
{"h": 10, "w": 12, "x": 12, "y": 54},
|
||||
{"h": 10, "w": 12, "x": 12, "y": 60},
|
||||
unit="none",
|
||||
limit=12,
|
||||
decimals=0,
|
||||
@ -1221,7 +1316,7 @@ def build_overview():
|
||||
18,
|
||||
"Cluster Ingress Throughput",
|
||||
NET_INGRESS_EXPR,
|
||||
{"h": 7, "w": 8, "x": 0, "y": 25},
|
||||
{"h": 7, "w": 8, "x": 0, "y": 31},
|
||||
unit="Bps",
|
||||
legend="Ingress (Traefik)",
|
||||
legend_display="list",
|
||||
@ -1234,7 +1329,7 @@ def build_overview():
|
||||
19,
|
||||
"Cluster Egress Throughput",
|
||||
NET_EGRESS_EXPR,
|
||||
{"h": 7, "w": 8, "x": 8, "y": 25},
|
||||
{"h": 7, "w": 8, "x": 8, "y": 31},
|
||||
unit="Bps",
|
||||
legend="Egress (Traefik)",
|
||||
legend_display="list",
|
||||
@ -1247,7 +1342,7 @@ def build_overview():
|
||||
20,
|
||||
"Intra-Cluster Throughput",
|
||||
NET_INTERNAL_EXPR,
|
||||
{"h": 7, "w": 8, "x": 16, "y": 25},
|
||||
{"h": 7, "w": 8, "x": 16, "y": 31},
|
||||
unit="Bps",
|
||||
legend="Internal traffic",
|
||||
legend_display="list",
|
||||
@ -1261,7 +1356,7 @@ def build_overview():
|
||||
21,
|
||||
"Root Filesystem Usage",
|
||||
root_usage_expr(),
|
||||
{"h": 16, "w": 12, "x": 0, "y": 64},
|
||||
{"h": 16, "w": 12, "x": 0, "y": 70},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
legend_calcs=["last"],
|
||||
@ -1276,7 +1371,7 @@ def build_overview():
|
||||
22,
|
||||
"Nodes Closest to Full Root Disks",
|
||||
f"topk(12, {root_usage_expr()})",
|
||||
{"h": 16, "w": 12, "x": 12, "y": 64},
|
||||
{"h": 16, "w": 12, "x": 12, "y": 70},
|
||||
unit="percent",
|
||||
thresholds=PERCENT_THRESHOLDS,
|
||||
links=link_to("atlas-storage"),
|
||||
@ -2171,7 +2266,7 @@ def build_mail_dashboard():
|
||||
}
|
||||
|
||||
|
||||
def build_testing_dashboard():
|
||||
def build_jobs_dashboard():
|
||||
panels = []
|
||||
age_thresholds = {
|
||||
"mode": "absolute",
|
||||
@ -2192,12 +2287,65 @@ def build_testing_dashboard():
|
||||
],
|
||||
}
|
||||
|
||||
task_error_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 1},
|
||||
{"color": "orange", "value": 3},
|
||||
{"color": "red", "value": 5},
|
||||
],
|
||||
}
|
||||
|
||||
panels.append(
|
||||
bargauge_panel(
|
||||
1,
|
||||
"Ariadne Task Errors (24h)",
|
||||
ARIADNE_TASK_ERRORS_24H,
|
||||
{"h": 7, "w": 6, "x": 0, "y": 0},
|
||||
unit="none",
|
||||
instant=True,
|
||||
legend="{{task}}",
|
||||
thresholds=task_error_thresholds,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
{
|
||||
"id": 2,
|
||||
"type": "timeseries",
|
||||
"title": "Ariadne Attempts vs Failures (1h)",
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": {"h": 7, "w": 12, "x": 6, "y": 0},
|
||||
"targets": [
|
||||
{"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"},
|
||||
{"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"},
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
|
||||
"options": {
|
||||
"legend": {"displayMode": "table", "placement": "right"},
|
||||
"tooltip": {"mode": "multi"},
|
||||
},
|
||||
}
|
||||
)
|
||||
panels.append(
|
||||
bargauge_panel(
|
||||
3,
|
||||
"One-off Job Pods (age hours)",
|
||||
ONEOFF_JOB_POD_AGE_HOURS,
|
||||
{"h": 7, "w": 6, "x": 18, "y": 0},
|
||||
unit="h",
|
||||
instant=True,
|
||||
legend="{{namespace}}/{{pod}}",
|
||||
thresholds=age_thresholds,
|
||||
limit=12,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
1,
|
||||
4,
|
||||
"Glue Jobs Stale (>36h)",
|
||||
GLUE_STALE_COUNT,
|
||||
{"h": 4, "w": 6, "x": 0, "y": 0},
|
||||
{"h": 4, "w": 4, "x": 0, "y": 7},
|
||||
unit="none",
|
||||
thresholds={
|
||||
"mode": "absolute",
|
||||
@ -2212,99 +2360,47 @@ def build_testing_dashboard():
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
2,
|
||||
5,
|
||||
"Glue Jobs Missing Success",
|
||||
GLUE_MISSING_COUNT,
|
||||
{"h": 4, "w": 4, "x": 4, "y": 0},
|
||||
unit="none",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
3,
|
||||
"Glue Jobs Suspended",
|
||||
GLUE_SUSPENDED_COUNT,
|
||||
{"h": 4, "w": 4, "x": 8, "y": 0},
|
||||
unit="none",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
4,
|
||||
"Ariadne Task Errors (1h)",
|
||||
ARIADNE_TASK_ERRORS_1H_TOTAL,
|
||||
{"h": 4, "w": 4, "x": 12, "y": 0},
|
||||
unit="none",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
5,
|
||||
"Ariadne Task Errors (24h)",
|
||||
ARIADNE_TASK_ERRORS_24H_TOTAL,
|
||||
{"h": 4, "w": 4, "x": 16, "y": 0},
|
||||
{"h": 4, "w": 4, "x": 4, "y": 7},
|
||||
unit="none",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
6,
|
||||
"Ariadne Task Runs (1h)",
|
||||
ARIADNE_TASK_RUNS_1H_TOTAL,
|
||||
{"h": 4, "w": 4, "x": 20, "y": 0},
|
||||
"Glue Jobs Suspended",
|
||||
GLUE_SUSPENDED_COUNT,
|
||||
{"h": 4, "w": 4, "x": 8, "y": 7},
|
||||
unit="none",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
stat_panel(
|
||||
7,
|
||||
"Ariadne Task Runs vs Errors (1h)",
|
||||
ARIADNE_TASK_RUNS_BY_STATUS_1H,
|
||||
{"h": 6, "w": 24, "x": 0, "y": 4},
|
||||
"Ariadne Task Errors (1h)",
|
||||
ARIADNE_TASK_ERRORS_1H_TOTAL,
|
||||
{"h": 4, "w": 4, "x": 12, "y": 7},
|
||||
unit="none",
|
||||
legend="{{status}}",
|
||||
legend_display="table",
|
||||
legend_placement="right",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
bargauge_panel(
|
||||
stat_panel(
|
||||
8,
|
||||
"Ariadne Task Errors (24h)",
|
||||
ARIADNE_TASK_ERRORS_24H,
|
||||
{"h": 8, "w": 12, "x": 0, "y": 10},
|
||||
ARIADNE_TASK_ERRORS_24H_TOTAL,
|
||||
{"h": 4, "w": 4, "x": 16, "y": 7},
|
||||
unit="none",
|
||||
instant=True,
|
||||
legend="{{task}}",
|
||||
thresholds={
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 1},
|
||||
{"color": "orange", "value": 3},
|
||||
{"color": "red", "value": 5},
|
||||
],
|
||||
},
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
bargauge_panel(
|
||||
stat_panel(
|
||||
9,
|
||||
"Ariadne Task Success (24h)",
|
||||
ARIADNE_TASK_SUCCESS_24H,
|
||||
{"h": 8, "w": 12, "x": 12, "y": 10},
|
||||
"Ariadne Task Runs (1h)",
|
||||
ARIADNE_TASK_RUNS_1H_TOTAL,
|
||||
{"h": 4, "w": 4, "x": 20, "y": 7},
|
||||
unit="none",
|
||||
instant=True,
|
||||
legend="{{task}}",
|
||||
thresholds={
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "red", "value": None},
|
||||
{"color": "orange", "value": 1},
|
||||
{"color": "yellow", "value": 5},
|
||||
{"color": "green", "value": 10},
|
||||
],
|
||||
},
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
@ -2312,7 +2408,7 @@ def build_testing_dashboard():
|
||||
10,
|
||||
"Ariadne Schedule Last Error (hours ago)",
|
||||
ARIADNE_SCHEDULE_LAST_ERROR_HOURS,
|
||||
{"h": 8, "w": 12, "x": 0, "y": 18},
|
||||
{"h": 8, "w": 12, "x": 0, "y": 11},
|
||||
unit="h",
|
||||
instant=True,
|
||||
legend="{{task}}",
|
||||
@ -2324,7 +2420,7 @@ def build_testing_dashboard():
|
||||
11,
|
||||
"Ariadne Schedule Last Success (hours ago)",
|
||||
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS,
|
||||
{"h": 8, "w": 12, "x": 12, "y": 18},
|
||||
{"h": 8, "w": 12, "x": 12, "y": 11},
|
||||
unit="h",
|
||||
instant=True,
|
||||
legend="{{task}}",
|
||||
@ -2336,7 +2432,7 @@ def build_testing_dashboard():
|
||||
12,
|
||||
"Glue Jobs Last Success (hours ago)",
|
||||
GLUE_LAST_SUCCESS_AGE_HOURS,
|
||||
{"h": 8, "w": 12, "x": 0, "y": 26},
|
||||
{"h": 8, "w": 12, "x": 0, "y": 19},
|
||||
unit="h",
|
||||
instant=True,
|
||||
legend="{{namespace}}/{{cronjob}}",
|
||||
@ -2348,7 +2444,7 @@ def build_testing_dashboard():
|
||||
13,
|
||||
"Glue Jobs Last Schedule (hours ago)",
|
||||
GLUE_LAST_SCHEDULE_AGE_HOURS,
|
||||
{"h": 8, "w": 12, "x": 12, "y": 26},
|
||||
{"h": 8, "w": 12, "x": 12, "y": 19},
|
||||
unit="h",
|
||||
instant=True,
|
||||
legend="{{namespace}}/{{cronjob}}",
|
||||
@ -2358,9 +2454,33 @@ def build_testing_dashboard():
|
||||
panels.append(
|
||||
bargauge_panel(
|
||||
14,
|
||||
"Ariadne Task Errors (1h)",
|
||||
ARIADNE_TASK_ERRORS_1H,
|
||||
{"h": 8, "w": 12, "x": 0, "y": 27},
|
||||
unit="none",
|
||||
instant=True,
|
||||
legend="{{task}}",
|
||||
thresholds=task_error_thresholds,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
bargauge_panel(
|
||||
15,
|
||||
"Ariadne Task Errors (30d)",
|
||||
ARIADNE_TASK_ERRORS_30D,
|
||||
{"h": 8, "w": 12, "x": 12, "y": 27},
|
||||
unit="none",
|
||||
instant=True,
|
||||
legend="{{task}}",
|
||||
thresholds=task_error_thresholds,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
bargauge_panel(
|
||||
16,
|
||||
"Ariadne Access Requests",
|
||||
ARIADNE_ACCESS_REQUESTS,
|
||||
{"h": 6, "w": 8, "x": 0, "y": 34},
|
||||
{"h": 6, "w": 8, "x": 0, "y": 35},
|
||||
unit="none",
|
||||
instant=True,
|
||||
legend="{{status}}",
|
||||
@ -2368,10 +2488,10 @@ def build_testing_dashboard():
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
15,
|
||||
17,
|
||||
"Ariadne CI Coverage (%)",
|
||||
ARIADNE_CI_COVERAGE,
|
||||
{"h": 6, "w": 4, "x": 8, "y": 34},
|
||||
{"h": 6, "w": 4, "x": 8, "y": 35},
|
||||
unit="percent",
|
||||
decimals=1,
|
||||
instant=True,
|
||||
@ -2380,10 +2500,10 @@ def build_testing_dashboard():
|
||||
)
|
||||
panels.append(
|
||||
table_panel(
|
||||
16,
|
||||
18,
|
||||
"Ariadne CI Tests (latest)",
|
||||
ARIADNE_CI_TESTS,
|
||||
{"h": 6, "w": 12, "x": 12, "y": 34},
|
||||
{"h": 6, "w": 12, "x": 12, "y": 35},
|
||||
unit="none",
|
||||
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
|
||||
instant=True,
|
||||
@ -2391,8 +2511,8 @@ def build_testing_dashboard():
|
||||
)
|
||||
|
||||
return {
|
||||
"uid": "atlas-testing",
|
||||
"title": "Atlas Testing",
|
||||
"uid": "atlas-jobs",
|
||||
"title": "Atlas Jobs",
|
||||
"folderUid": PRIVATE_FOLDER,
|
||||
"editable": True,
|
||||
"panels": panels,
|
||||
@ -2400,7 +2520,7 @@ def build_testing_dashboard():
|
||||
"annotations": {"list": []},
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["atlas", "testing"],
|
||||
"tags": ["atlas", "jobs", "glue"],
|
||||
}
|
||||
|
||||
|
||||
@ -2497,9 +2617,9 @@ DASHBOARDS = {
|
||||
"builder": build_mail_dashboard,
|
||||
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
|
||||
},
|
||||
"atlas-testing": {
|
||||
"builder": build_testing_dashboard,
|
||||
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-testing.yaml",
|
||||
"atlas-jobs": {
|
||||
"builder": build_jobs_dashboard,
|
||||
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml",
|
||||
},
|
||||
"atlas-gpu": {
|
||||
"builder": build_gpu_dashboard,
|
||||
|
||||
@ -270,7 +270,7 @@ spec:
|
||||
- name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
|
||||
value: "30 4 * * *"
|
||||
- name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
|
||||
value: "*/15 * * * *"
|
||||
value: "0 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_WGER_USER_SYNC
|
||||
value: "0 5 * * *"
|
||||
- name: ARIADNE_SCHEDULE_WGER_ADMIN
|
||||
@ -286,11 +286,11 @@ spec:
|
||||
- name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
|
||||
value: "30 4 * * 0"
|
||||
- name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
|
||||
value: "*/15 * * * *"
|
||||
value: "0 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_VAULT_OIDC
|
||||
value: "*/15 * * * *"
|
||||
value: "0 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
|
||||
value: "*/1 * * * *"
|
||||
value: "*/5 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
|
||||
value: "*/30 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -795,7 +795,7 @@
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 2,
|
||||
"h": 3,
|
||||
"w": 5,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
@ -862,7 +862,7 @@
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 2,
|
||||
"h": 3,
|
||||
"w": 5,
|
||||
"x": 10,
|
||||
"y": 8
|
||||
@ -967,7 +967,7 @@
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 2,
|
||||
"h": 3,
|
||||
"w": 5,
|
||||
"x": 5,
|
||||
"y": 8
|
||||
@ -1043,7 +1043,7 @@
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 2,
|
||||
"h": 3,
|
||||
"w": 5,
|
||||
"x": 15,
|
||||
"y": 8
|
||||
@ -1119,10 +1119,10 @@
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
"y": 11
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1194,10 +1194,10 @@
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 10
|
||||
"y": 11
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1269,10 +1269,10 @@
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 10
|
||||
"y": 11
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1336,10 +1336,10 @@
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 10
|
||||
"y": 11
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1394,6 +1394,238 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 40,
|
||||
"type": "bargauge",
|
||||
"title": "One-off Job Pods (age hours)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 4,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}/{{pod}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "h",
|
||||
"min": 0,
|
||||
"max": null,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 6
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 24
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 48
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
}
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"fields": [
|
||||
"Value"
|
||||
],
|
||||
"order": "desc"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "limit",
|
||||
"options": {
|
||||
"limit": 8
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 41,
|
||||
"type": "timeseries",
|
||||
"title": "Ariadne Attempts vs Failures (1h)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 4,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(ariadne_task_runs_total[1h]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Attempts"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "Failures"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 42,
|
||||
"type": "timeseries",
|
||||
"title": "Ariadne Test Success Rate",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 12,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 43,
|
||||
"type": "bargauge",
|
||||
"title": "Tests with Failures (24h)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 4,
|
||||
"x": 20,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{result}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"min": 0,
|
||||
"max": null,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 5
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
}
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"fields": [
|
||||
"Value"
|
||||
],
|
||||
"order": "desc"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "piechart",
|
||||
@ -1406,7 +1638,7 @@
|
||||
"h": 9,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
"y": 22
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1475,7 +1707,7 @@
|
||||
"h": 9,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 16
|
||||
"y": 22
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1544,7 +1776,7 @@
|
||||
"h": 9,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 16
|
||||
"y": 22
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1613,7 +1845,7 @@
|
||||
"h": 12,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
"y": 38
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1660,7 +1892,7 @@
|
||||
"h": 12,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 32
|
||||
"y": 38
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1707,7 +1939,7 @@
|
||||
"h": 10,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 44
|
||||
"y": 50
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1744,7 +1976,7 @@
|
||||
"h": 10,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 44
|
||||
"y": 50
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1781,7 +2013,7 @@
|
||||
"h": 10,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 54
|
||||
"y": 60
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1832,7 +2064,7 @@
|
||||
"h": 10,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 54
|
||||
"y": 60
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1913,7 +2145,7 @@
|
||||
"h": 7,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 25
|
||||
"y": 31
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1957,7 +2189,7 @@
|
||||
"h": 7,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 25
|
||||
"y": 31
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -2001,7 +2233,7 @@
|
||||
"h": 7,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 25
|
||||
"y": 31
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -2045,7 +2277,7 @@
|
||||
"h": 16,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 64
|
||||
"y": 70
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -2093,7 +2325,7 @@
|
||||
"h": 16,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 64
|
||||
"y": 70
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -804,7 +804,7 @@ data:
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 2,
|
||||
"h": 3,
|
||||
"w": 5,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
@ -871,7 +871,7 @@ data:
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 2,
|
||||
"h": 3,
|
||||
"w": 5,
|
||||
"x": 10,
|
||||
"y": 8
|
||||
@ -976,7 +976,7 @@ data:
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 2,
|
||||
"h": 3,
|
||||
"w": 5,
|
||||
"x": 5,
|
||||
"y": 8
|
||||
@ -1052,7 +1052,7 @@ data:
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 2,
|
||||
"h": 3,
|
||||
"w": 5,
|
||||
"x": 15,
|
||||
"y": 8
|
||||
@ -1128,10 +1128,10 @@ data:
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
"y": 11
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1203,10 +1203,10 @@ data:
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 10
|
||||
"y": 11
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1278,10 +1278,10 @@ data:
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 10
|
||||
"y": 11
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1345,10 +1345,10 @@ data:
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 10
|
||||
"y": 11
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1403,6 +1403,238 @@ data:
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 40,
|
||||
"type": "bargauge",
|
||||
"title": "One-off Job Pods (age hours)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 4,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}/{{pod}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "h",
|
||||
"min": 0,
|
||||
"max": null,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 6
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 24
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 48
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
}
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"fields": [
|
||||
"Value"
|
||||
],
|
||||
"order": "desc"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "limit",
|
||||
"options": {
|
||||
"limit": 8
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 41,
|
||||
"type": "timeseries",
|
||||
"title": "Ariadne Attempts vs Failures (1h)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 4,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(ariadne_task_runs_total[1h]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Attempts"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "Failures"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 42,
|
||||
"type": "timeseries",
|
||||
"title": "Ariadne Test Success Rate",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 12,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 43,
|
||||
"type": "bargauge",
|
||||
"title": "Tests with Failures (24h)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 4,
|
||||
"x": 20,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{result}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"min": 0,
|
||||
"max": null,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 5
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
}
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"fields": [
|
||||
"Value"
|
||||
],
|
||||
"order": "desc"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "piechart",
|
||||
@ -1415,7 +1647,7 @@ data:
|
||||
"h": 9,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
"y": 22
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1484,7 +1716,7 @@ data:
|
||||
"h": 9,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 16
|
||||
"y": 22
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1553,7 +1785,7 @@ data:
|
||||
"h": 9,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 16
|
||||
"y": 22
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1622,7 +1854,7 @@ data:
|
||||
"h": 12,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
"y": 38
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1669,7 +1901,7 @@ data:
|
||||
"h": 12,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 32
|
||||
"y": 38
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1716,7 +1948,7 @@ data:
|
||||
"h": 10,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 44
|
||||
"y": 50
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1753,7 +1985,7 @@ data:
|
||||
"h": 10,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 44
|
||||
"y": 50
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1790,7 +2022,7 @@ data:
|
||||
"h": 10,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 54
|
||||
"y": 60
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1841,7 +2073,7 @@ data:
|
||||
"h": 10,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 54
|
||||
"y": 60
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1922,7 +2154,7 @@ data:
|
||||
"h": 7,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 25
|
||||
"y": 31
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -1966,7 +2198,7 @@ data:
|
||||
"h": 7,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 25
|
||||
"y": 31
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -2010,7 +2242,7 @@ data:
|
||||
"h": 7,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 25
|
||||
"y": 31
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -2054,7 +2286,7 @@ data:
|
||||
"h": 16,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 64
|
||||
"y": 70
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -2102,7 +2334,7 @@ data:
|
||||
"h": 16,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 64
|
||||
"y": 70
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
|
||||
@ -471,14 +471,14 @@ spec:
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards/mail
|
||||
- name: testing
|
||||
- name: jobs
|
||||
orgId: 1
|
||||
folder: Atlas Internal
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards/testing
|
||||
path: /var/lib/grafana/dashboards/jobs
|
||||
dashboardsConfigMaps:
|
||||
overview: grafana-dashboard-overview
|
||||
overview-public: grafana-dashboard-overview
|
||||
@ -488,7 +488,7 @@ spec:
|
||||
gpu: grafana-dashboard-gpu
|
||||
network: grafana-dashboard-network
|
||||
mail: grafana-dashboard-mail
|
||||
testing: grafana-dashboard-testing
|
||||
jobs: grafana-dashboard-jobs
|
||||
extraConfigmapMounts:
|
||||
- name: grafana-folders
|
||||
mountPath: /etc/grafana/provisioning/folders
|
||||
|
||||
@ -14,7 +14,7 @@ resources:
|
||||
- grafana-dashboard-network.yaml
|
||||
- grafana-dashboard-gpu.yaml
|
||||
- grafana-dashboard-mail.yaml
|
||||
- grafana-dashboard-testing.yaml
|
||||
- grafana-dashboard-jobs.yaml
|
||||
- dcgm-exporter.yaml
|
||||
- jetson-tegrastats-exporter.yaml
|
||||
- postmark-exporter-service.yaml
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user