monitoring: refresh jobs dashboards

This commit is contained in:
Brad Stein 2026-01-21 13:37:36 -03:00
parent 2e407e1962
commit 8b35ab0292
8 changed files with 1946 additions and 1042 deletions

View File

@ -337,16 +337,39 @@ GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))"
GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})"
GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})"
ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))' ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))' ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ATTEMPTS_1H = 'sum(increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_FAILURES_1H = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600"
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
ARIADNE_TEST_SUCCESS_RATE = (
"100 * "
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[1h])) '
"/ clamp_min("
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[1h])), 1)'
)
ARIADNE_TEST_FAILURES_24H = (
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
)
ONEOFF_JOB_OWNER = (
'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")'
)
ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})'
ONEOFF_JOB_POD_AGE_HOURS = (
'((time() - kube_pod_start_time{pod!=""}) / 3600) '
f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} '
'* on(namespace,pod) group_left(phase) '
'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})'
)
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES) GPU_NODE_REGEX = "|".join(GPU_NODES)
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
@ -798,6 +821,15 @@ def build_overview():
{"color": "red", "value": 3}, {"color": "red", "value": 3},
], ],
} }
age_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 24},
{"color": "red", "value": 48},
],
}
row1_stats = [ row1_stats = [
{ {
@ -1000,7 +1032,7 @@ def build_overview():
30, 30,
"Mail Sent (1d)", "Mail Sent (1d)",
'max(postmark_outbound_sent{window="1d"})', 'max(postmark_outbound_sent{window="1d"})',
{"h": 2, "w": 5, "x": 0, "y": 8}, {"h": 3, "w": 5, "x": 0, "y": 8},
unit="none", unit="none",
links=link_to("atlas-mail"), links=link_to("atlas-mail"),
) )
@ -1011,7 +1043,7 @@ def build_overview():
"type": "stat", "type": "stat",
"title": "Mail Bounces (1d)", "title": "Mail Bounces (1d)",
"datasource": PROM_DS, "datasource": PROM_DS,
"gridPos": {"h": 2, "w": 5, "x": 10, "y": 8}, "gridPos": {"h": 3, "w": 5, "x": 10, "y": 8},
"targets": [ "targets": [
{ {
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})', "expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
@ -1057,7 +1089,7 @@ def build_overview():
32, 32,
"Mail Success Rate (1d)", "Mail Success Rate (1d)",
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
{"h": 2, "w": 5, "x": 5, "y": 8}, {"h": 3, "w": 5, "x": 5, "y": 8},
unit="percent", unit="percent",
thresholds=mail_success_thresholds, thresholds=mail_success_thresholds,
decimals=1, decimals=1,
@ -1069,7 +1101,7 @@ def build_overview():
33, 33,
"Mail Limit Used (30d)", "Mail Limit Used (30d)",
"max(postmark_sending_limit_used_percent)", "max(postmark_sending_limit_used_percent)",
{"h": 2, "w": 5, "x": 15, "y": 8}, {"h": 3, "w": 5, "x": 15, "y": 8},
unit="percent", unit="percent",
thresholds=mail_limit_thresholds, thresholds=mail_limit_thresholds,
decimals=1, decimals=1,
@ -1089,13 +1121,76 @@ def build_overview():
panel_id, panel_id,
title, title,
expr, expr,
{"h": 6, "w": 6, "x": 6 * idx, "y": 10}, {"h": 5, "w": 6, "x": 6 * idx, "y": 11},
unit=unit, unit=unit,
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
links=link_to("atlas-storage"), links=link_to("atlas-storage"),
) )
) )
panels.append(
bargauge_panel(
40,
"One-off Job Pods (age hours)",
ONEOFF_JOB_POD_AGE_HOURS,
{"h": 6, "w": 4, "x": 0, "y": 16},
unit="h",
instant=True,
legend="{{namespace}}/{{pod}}",
thresholds=age_thresholds,
limit=8,
)
)
panels.append(
{
"id": 41,
"type": "timeseries",
"title": "Ariadne Attempts vs Failures (1h)",
"datasource": PROM_DS,
"gridPos": {"h": 6, "w": 8, "x": 4, "y": 16},
"targets": [
{"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"},
{"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"},
],
"fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
"options": {
"legend": {"displayMode": "table", "placement": "right"},
"tooltip": {"mode": "multi"},
},
}
)
panels.append(
timeseries_panel(
42,
"Ariadne Test Success Rate",
ARIADNE_TEST_SUCCESS_RATE,
{"h": 6, "w": 8, "x": 12, "y": 16},
unit="percent",
legend=None,
legend_display="list",
)
)
panels.append(
bargauge_panel(
43,
"Tests with Failures (24h)",
ARIADNE_TEST_FAILURES_24H,
{"h": 6, "w": 4, "x": 20, "y": 16},
unit="none",
instant=True,
legend="{{result}}",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 5},
{"color": "red", "value": 10},
],
},
)
)
cpu_scope = "$namespace_scope_cpu" cpu_scope = "$namespace_scope_cpu"
gpu_scope = "$namespace_scope_gpu" gpu_scope = "$namespace_scope_gpu"
ram_scope = "$namespace_scope_ram" ram_scope = "$namespace_scope_ram"
@ -1105,7 +1200,7 @@ def build_overview():
11, 11,
"Namespace CPU Share", "Namespace CPU Share",
namespace_cpu_share_expr(cpu_scope), namespace_cpu_share_expr(cpu_scope),
{"h": 9, "w": 8, "x": 0, "y": 16}, {"h": 9, "w": 8, "x": 0, "y": 22},
links=namespace_scope_links("namespace_scope_cpu"), links=namespace_scope_links("namespace_scope_cpu"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.", description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
) )
@ -1115,7 +1210,7 @@ def build_overview():
12, 12,
"Namespace GPU Share", "Namespace GPU Share",
namespace_gpu_share_expr(gpu_scope), namespace_gpu_share_expr(gpu_scope),
{"h": 9, "w": 8, "x": 8, "y": 16}, {"h": 9, "w": 8, "x": 8, "y": 22},
links=namespace_scope_links("namespace_scope_gpu"), links=namespace_scope_links("namespace_scope_gpu"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.", description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
) )
@ -1125,7 +1220,7 @@ def build_overview():
13, 13,
"Namespace RAM Share", "Namespace RAM Share",
namespace_ram_share_expr(ram_scope), namespace_ram_share_expr(ram_scope),
{"h": 9, "w": 8, "x": 16, "y": 16}, {"h": 9, "w": 8, "x": 16, "y": 22},
links=namespace_scope_links("namespace_scope_ram"), links=namespace_scope_links("namespace_scope_ram"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.", description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
) )
@ -1137,7 +1232,7 @@ def build_overview():
14, 14,
"Worker Node CPU", "Worker Node CPU",
node_cpu_expr(worker_filter), node_cpu_expr(worker_filter),
{"h": 12, "w": 12, "x": 0, "y": 32}, {"h": 12, "w": 12, "x": 0, "y": 38},
unit="percent", unit="percent",
legend="{{node}}", legend="{{node}}",
legend_calcs=["last"], legend_calcs=["last"],
@ -1151,7 +1246,7 @@ def build_overview():
15, 15,
"Worker Node RAM", "Worker Node RAM",
node_mem_expr(worker_filter), node_mem_expr(worker_filter),
{"h": 12, "w": 12, "x": 12, "y": 32}, {"h": 12, "w": 12, "x": 12, "y": 38},
unit="percent", unit="percent",
legend="{{node}}", legend="{{node}}",
legend_calcs=["last"], legend_calcs=["last"],
@ -1166,7 +1261,7 @@ def build_overview():
16, 16,
"Control plane CPU", "Control plane CPU",
node_cpu_expr(CONTROL_ALL_REGEX), node_cpu_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 0, "y": 44}, {"h": 10, "w": 12, "x": 0, "y": 50},
unit="percent", unit="percent",
legend="{{node}}", legend="{{node}}",
legend_display="table", legend_display="table",
@ -1178,7 +1273,7 @@ def build_overview():
17, 17,
"Control plane RAM", "Control plane RAM",
node_mem_expr(CONTROL_ALL_REGEX), node_mem_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 12, "y": 44}, {"h": 10, "w": 12, "x": 12, "y": 50},
unit="percent", unit="percent",
legend="{{node}}", legend="{{node}}",
legend_display="table", legend_display="table",
@ -1191,7 +1286,7 @@ def build_overview():
28, 28,
"Node Pod Share", "Node Pod Share",
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
{"h": 10, "w": 12, "x": 0, "y": 54}, {"h": 10, "w": 12, "x": 0, "y": 60},
) )
) )
panels.append( panels.append(
@ -1199,7 +1294,7 @@ def build_overview():
29, 29,
"Top Nodes by Pod Count", "Top Nodes by Pod Count",
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
{"h": 10, "w": 12, "x": 12, "y": 54}, {"h": 10, "w": 12, "x": 12, "y": 60},
unit="none", unit="none",
limit=12, limit=12,
decimals=0, decimals=0,
@ -1221,7 +1316,7 @@ def build_overview():
18, 18,
"Cluster Ingress Throughput", "Cluster Ingress Throughput",
NET_INGRESS_EXPR, NET_INGRESS_EXPR,
{"h": 7, "w": 8, "x": 0, "y": 25}, {"h": 7, "w": 8, "x": 0, "y": 31},
unit="Bps", unit="Bps",
legend="Ingress (Traefik)", legend="Ingress (Traefik)",
legend_display="list", legend_display="list",
@ -1234,7 +1329,7 @@ def build_overview():
19, 19,
"Cluster Egress Throughput", "Cluster Egress Throughput",
NET_EGRESS_EXPR, NET_EGRESS_EXPR,
{"h": 7, "w": 8, "x": 8, "y": 25}, {"h": 7, "w": 8, "x": 8, "y": 31},
unit="Bps", unit="Bps",
legend="Egress (Traefik)", legend="Egress (Traefik)",
legend_display="list", legend_display="list",
@ -1247,7 +1342,7 @@ def build_overview():
20, 20,
"Intra-Cluster Throughput", "Intra-Cluster Throughput",
NET_INTERNAL_EXPR, NET_INTERNAL_EXPR,
{"h": 7, "w": 8, "x": 16, "y": 25}, {"h": 7, "w": 8, "x": 16, "y": 31},
unit="Bps", unit="Bps",
legend="Internal traffic", legend="Internal traffic",
legend_display="list", legend_display="list",
@ -1261,7 +1356,7 @@ def build_overview():
21, 21,
"Root Filesystem Usage", "Root Filesystem Usage",
root_usage_expr(), root_usage_expr(),
{"h": 16, "w": 12, "x": 0, "y": 64}, {"h": 16, "w": 12, "x": 0, "y": 70},
unit="percent", unit="percent",
legend="{{node}}", legend="{{node}}",
legend_calcs=["last"], legend_calcs=["last"],
@ -1276,7 +1371,7 @@ def build_overview():
22, 22,
"Nodes Closest to Full Root Disks", "Nodes Closest to Full Root Disks",
f"topk(12, {root_usage_expr()})", f"topk(12, {root_usage_expr()})",
{"h": 16, "w": 12, "x": 12, "y": 64}, {"h": 16, "w": 12, "x": 12, "y": 70},
unit="percent", unit="percent",
thresholds=PERCENT_THRESHOLDS, thresholds=PERCENT_THRESHOLDS,
links=link_to("atlas-storage"), links=link_to("atlas-storage"),
@ -2171,7 +2266,7 @@ def build_mail_dashboard():
} }
def build_testing_dashboard(): def build_jobs_dashboard():
panels = [] panels = []
age_thresholds = { age_thresholds = {
"mode": "absolute", "mode": "absolute",
@ -2192,12 +2287,65 @@ def build_testing_dashboard():
], ],
} }
task_error_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 3},
{"color": "red", "value": 5},
],
}
panels.append(
bargauge_panel(
1,
"Ariadne Task Errors (24h)",
ARIADNE_TASK_ERRORS_24H,
{"h": 7, "w": 6, "x": 0, "y": 0},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
{
"id": 2,
"type": "timeseries",
"title": "Ariadne Attempts vs Failures (1h)",
"datasource": PROM_DS,
"gridPos": {"h": 7, "w": 12, "x": 6, "y": 0},
"targets": [
{"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"},
{"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"},
],
"fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
"options": {
"legend": {"displayMode": "table", "placement": "right"},
"tooltip": {"mode": "multi"},
},
}
)
panels.append(
bargauge_panel(
3,
"One-off Job Pods (age hours)",
ONEOFF_JOB_POD_AGE_HOURS,
{"h": 7, "w": 6, "x": 18, "y": 0},
unit="h",
instant=True,
legend="{{namespace}}/{{pod}}",
thresholds=age_thresholds,
limit=12,
)
)
panels.append( panels.append(
stat_panel( stat_panel(
1, 4,
"Glue Jobs Stale (>36h)", "Glue Jobs Stale (>36h)",
GLUE_STALE_COUNT, GLUE_STALE_COUNT,
{"h": 4, "w": 6, "x": 0, "y": 0}, {"h": 4, "w": 4, "x": 0, "y": 7},
unit="none", unit="none",
thresholds={ thresholds={
"mode": "absolute", "mode": "absolute",
@ -2212,99 +2360,47 @@ def build_testing_dashboard():
) )
panels.append( panels.append(
stat_panel( stat_panel(
2, 5,
"Glue Jobs Missing Success", "Glue Jobs Missing Success",
GLUE_MISSING_COUNT, GLUE_MISSING_COUNT,
{"h": 4, "w": 4, "x": 4, "y": 0}, {"h": 4, "w": 4, "x": 4, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
3,
"Glue Jobs Suspended",
GLUE_SUSPENDED_COUNT,
{"h": 4, "w": 4, "x": 8, "y": 0},
unit="none",
)
)
panels.append(
stat_panel(
4,
"Ariadne Task Errors (1h)",
ARIADNE_TASK_ERRORS_1H_TOTAL,
{"h": 4, "w": 4, "x": 12, "y": 0},
unit="none",
)
)
panels.append(
stat_panel(
5,
"Ariadne Task Errors (24h)",
ARIADNE_TASK_ERRORS_24H_TOTAL,
{"h": 4, "w": 4, "x": 16, "y": 0},
unit="none", unit="none",
) )
) )
panels.append( panels.append(
stat_panel( stat_panel(
6, 6,
"Ariadne Task Runs (1h)", "Glue Jobs Suspended",
ARIADNE_TASK_RUNS_1H_TOTAL, GLUE_SUSPENDED_COUNT,
{"h": 4, "w": 4, "x": 20, "y": 0}, {"h": 4, "w": 4, "x": 8, "y": 7},
unit="none", unit="none",
) )
) )
panels.append( panels.append(
timeseries_panel( stat_panel(
7, 7,
"Ariadne Task Runs vs Errors (1h)", "Ariadne Task Errors (1h)",
ARIADNE_TASK_RUNS_BY_STATUS_1H, ARIADNE_TASK_ERRORS_1H_TOTAL,
{"h": 6, "w": 24, "x": 0, "y": 4}, {"h": 4, "w": 4, "x": 12, "y": 7},
unit="none", unit="none",
legend="{{status}}",
legend_display="table",
legend_placement="right",
) )
) )
panels.append( panels.append(
bargauge_panel( stat_panel(
8, 8,
"Ariadne Task Errors (24h)", "Ariadne Task Errors (24h)",
ARIADNE_TASK_ERRORS_24H, ARIADNE_TASK_ERRORS_24H_TOTAL,
{"h": 8, "w": 12, "x": 0, "y": 10}, {"h": 4, "w": 4, "x": 16, "y": 7},
unit="none", unit="none",
instant=True,
legend="{{task}}",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 3},
{"color": "red", "value": 5},
],
},
) )
) )
panels.append( panels.append(
bargauge_panel( stat_panel(
9, 9,
"Ariadne Task Success (24h)", "Ariadne Task Runs (1h)",
ARIADNE_TASK_SUCCESS_24H, ARIADNE_TASK_RUNS_1H_TOTAL,
{"h": 8, "w": 12, "x": 12, "y": 10}, {"h": 4, "w": 4, "x": 20, "y": 7},
unit="none", unit="none",
instant=True,
legend="{{task}}",
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 1},
{"color": "yellow", "value": 5},
{"color": "green", "value": 10},
],
},
) )
) )
panels.append( panels.append(
@ -2312,7 +2408,7 @@ def build_testing_dashboard():
10, 10,
"Ariadne Schedule Last Error (hours ago)", "Ariadne Schedule Last Error (hours ago)",
ARIADNE_SCHEDULE_LAST_ERROR_HOURS, ARIADNE_SCHEDULE_LAST_ERROR_HOURS,
{"h": 8, "w": 12, "x": 0, "y": 18}, {"h": 8, "w": 12, "x": 0, "y": 11},
unit="h", unit="h",
instant=True, instant=True,
legend="{{task}}", legend="{{task}}",
@ -2324,7 +2420,7 @@ def build_testing_dashboard():
11, 11,
"Ariadne Schedule Last Success (hours ago)", "Ariadne Schedule Last Success (hours ago)",
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS,
{"h": 8, "w": 12, "x": 12, "y": 18}, {"h": 8, "w": 12, "x": 12, "y": 11},
unit="h", unit="h",
instant=True, instant=True,
legend="{{task}}", legend="{{task}}",
@ -2336,7 +2432,7 @@ def build_testing_dashboard():
12, 12,
"Glue Jobs Last Success (hours ago)", "Glue Jobs Last Success (hours ago)",
GLUE_LAST_SUCCESS_AGE_HOURS, GLUE_LAST_SUCCESS_AGE_HOURS,
{"h": 8, "w": 12, "x": 0, "y": 26}, {"h": 8, "w": 12, "x": 0, "y": 19},
unit="h", unit="h",
instant=True, instant=True,
legend="{{namespace}}/{{cronjob}}", legend="{{namespace}}/{{cronjob}}",
@ -2348,7 +2444,7 @@ def build_testing_dashboard():
13, 13,
"Glue Jobs Last Schedule (hours ago)", "Glue Jobs Last Schedule (hours ago)",
GLUE_LAST_SCHEDULE_AGE_HOURS, GLUE_LAST_SCHEDULE_AGE_HOURS,
{"h": 8, "w": 12, "x": 12, "y": 26}, {"h": 8, "w": 12, "x": 12, "y": 19},
unit="h", unit="h",
instant=True, instant=True,
legend="{{namespace}}/{{cronjob}}", legend="{{namespace}}/{{cronjob}}",
@ -2358,9 +2454,33 @@ def build_testing_dashboard():
panels.append( panels.append(
bargauge_panel( bargauge_panel(
14, 14,
"Ariadne Task Errors (1h)",
ARIADNE_TASK_ERRORS_1H,
{"h": 8, "w": 12, "x": 0, "y": 27},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
bargauge_panel(
15,
"Ariadne Task Errors (30d)",
ARIADNE_TASK_ERRORS_30D,
{"h": 8, "w": 12, "x": 12, "y": 27},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
bargauge_panel(
16,
"Ariadne Access Requests", "Ariadne Access Requests",
ARIADNE_ACCESS_REQUESTS, ARIADNE_ACCESS_REQUESTS,
{"h": 6, "w": 8, "x": 0, "y": 34}, {"h": 6, "w": 8, "x": 0, "y": 35},
unit="none", unit="none",
instant=True, instant=True,
legend="{{status}}", legend="{{status}}",
@ -2368,10 +2488,10 @@ def build_testing_dashboard():
) )
panels.append( panels.append(
stat_panel( stat_panel(
15, 17,
"Ariadne CI Coverage (%)", "Ariadne CI Coverage (%)",
ARIADNE_CI_COVERAGE, ARIADNE_CI_COVERAGE,
{"h": 6, "w": 4, "x": 8, "y": 34}, {"h": 6, "w": 4, "x": 8, "y": 35},
unit="percent", unit="percent",
decimals=1, decimals=1,
instant=True, instant=True,
@ -2380,10 +2500,10 @@ def build_testing_dashboard():
) )
panels.append( panels.append(
table_panel( table_panel(
16, 18,
"Ariadne CI Tests (latest)", "Ariadne CI Tests (latest)",
ARIADNE_CI_TESTS, ARIADNE_CI_TESTS,
{"h": 6, "w": 12, "x": 12, "y": 34}, {"h": 6, "w": 12, "x": 12, "y": 35},
unit="none", unit="none",
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
instant=True, instant=True,
@ -2391,8 +2511,8 @@ def build_testing_dashboard():
) )
return { return {
"uid": "atlas-testing", "uid": "atlas-jobs",
"title": "Atlas Testing", "title": "Atlas Jobs",
"folderUid": PRIVATE_FOLDER, "folderUid": PRIVATE_FOLDER,
"editable": True, "editable": True,
"panels": panels, "panels": panels,
@ -2400,7 +2520,7 @@ def build_testing_dashboard():
"annotations": {"list": []}, "annotations": {"list": []},
"schemaVersion": 39, "schemaVersion": 39,
"style": "dark", "style": "dark",
"tags": ["atlas", "testing"], "tags": ["atlas", "jobs", "glue"],
} }
@ -2497,9 +2617,9 @@ DASHBOARDS = {
"builder": build_mail_dashboard, "builder": build_mail_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml", "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
}, },
"atlas-testing": { "atlas-jobs": {
"builder": build_testing_dashboard, "builder": build_jobs_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-testing.yaml", "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml",
}, },
"atlas-gpu": { "atlas-gpu": {
"builder": build_gpu_dashboard, "builder": build_gpu_dashboard,

View File

@ -270,7 +270,7 @@ spec:
- name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
value: "30 4 * * *" value: "30 4 * * *"
- name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
value: "*/15 * * * *" value: "0 * * * *"
- name: ARIADNE_SCHEDULE_WGER_USER_SYNC - name: ARIADNE_SCHEDULE_WGER_USER_SYNC
value: "0 5 * * *" value: "0 5 * * *"
- name: ARIADNE_SCHEDULE_WGER_ADMIN - name: ARIADNE_SCHEDULE_WGER_ADMIN
@ -286,11 +286,11 @@ spec:
- name: ARIADNE_SCHEDULE_IMAGE_SWEEPER - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
value: "30 4 * * 0" value: "30 4 * * 0"
- name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
value: "*/15 * * * *" value: "0 * * * *"
- name: ARIADNE_SCHEDULE_VAULT_OIDC - name: ARIADNE_SCHEDULE_VAULT_OIDC
value: "*/15 * * * *" value: "0 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
value: "*/1 * * * *" value: "*/5 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
value: "*/30 * * * *" value: "*/30 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM - name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM

View File

@ -795,7 +795,7 @@
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 2, "h": 3,
"w": 5, "w": 5,
"x": 0, "x": 0,
"y": 8 "y": 8
@ -862,7 +862,7 @@
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 2, "h": 3,
"w": 5, "w": 5,
"x": 10, "x": 10,
"y": 8 "y": 8
@ -967,7 +967,7 @@
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 2, "h": 3,
"w": 5, "w": 5,
"x": 5, "x": 5,
"y": 8 "y": 8
@ -1043,7 +1043,7 @@
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 2, "h": 3,
"w": 5, "w": 5,
"x": 15, "x": 15,
"y": 8 "y": 8
@ -1119,10 +1119,10 @@
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 6, "h": 5,
"w": 6, "w": 6,
"x": 0, "x": 0,
"y": 10 "y": 11
}, },
"targets": [ "targets": [
{ {
@ -1194,10 +1194,10 @@
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 6, "h": 5,
"w": 6, "w": 6,
"x": 6, "x": 6,
"y": 10 "y": 11
}, },
"targets": [ "targets": [
{ {
@ -1269,10 +1269,10 @@
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 6, "h": 5,
"w": 6, "w": 6,
"x": 12, "x": 12,
"y": 10 "y": 11
}, },
"targets": [ "targets": [
{ {
@ -1336,10 +1336,10 @@
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 6, "h": 5,
"w": 6, "w": 6,
"x": 18, "x": 18,
"y": 10 "y": 11
}, },
"targets": [ "targets": [
{ {
@ -1394,6 +1394,238 @@
} }
] ]
}, },
{
"id": 40,
"type": "bargauge",
"title": "One-off Job Pods (age hours)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 4,
"x": 0,
"y": 16
},
"targets": [
{
"expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
"refId": "A",
"legendFormat": "{{namespace}}/{{pod}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "h",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 6
},
{
"color": "orange",
"value": 24
},
{
"color": "red",
"value": 48
}
]
}
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 8
}
}
]
},
{
"id": 41,
"type": "timeseries",
"title": "Ariadne Attempts vs Failures (1h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 4,
"y": 16
},
"targets": [
{
"expr": "sum(increase(ariadne_task_runs_total[1h]))",
"refId": "A",
"legendFormat": "Attempts"
},
{
"expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
"refId": "B",
"legendFormat": "Failures"
}
],
"fieldConfig": {
"defaults": {
"unit": "none"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 42,
"type": "timeseries",
"title": "Ariadne Test Success Rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 12,
"y": 16
},
"targets": [
{
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 43,
"type": "bargauge",
"title": "Tests with Failures (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 4,
"x": 20,
"y": 16
},
"targets": [
{
"expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))",
"refId": "A",
"legendFormat": "{{result}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 5
},
{
"color": "red",
"value": 10
}
]
}
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
}
]
},
{ {
"id": 11, "id": 11,
"type": "piechart", "type": "piechart",
@ -1406,7 +1638,7 @@
"h": 9, "h": 9,
"w": 8, "w": 8,
"x": 0, "x": 0,
"y": 16 "y": 22
}, },
"targets": [ "targets": [
{ {
@ -1475,7 +1707,7 @@
"h": 9, "h": 9,
"w": 8, "w": 8,
"x": 8, "x": 8,
"y": 16 "y": 22
}, },
"targets": [ "targets": [
{ {
@ -1544,7 +1776,7 @@
"h": 9, "h": 9,
"w": 8, "w": 8,
"x": 16, "x": 16,
"y": 16 "y": 22
}, },
"targets": [ "targets": [
{ {
@ -1613,7 +1845,7 @@
"h": 12, "h": 12,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 32 "y": 38
}, },
"targets": [ "targets": [
{ {
@ -1660,7 +1892,7 @@
"h": 12, "h": 12,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 32 "y": 38
}, },
"targets": [ "targets": [
{ {
@ -1707,7 +1939,7 @@
"h": 10, "h": 10,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 44 "y": 50
}, },
"targets": [ "targets": [
{ {
@ -1744,7 +1976,7 @@
"h": 10, "h": 10,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 44 "y": 50
}, },
"targets": [ "targets": [
{ {
@ -1781,7 +2013,7 @@
"h": 10, "h": 10,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 54 "y": 60
}, },
"targets": [ "targets": [
{ {
@ -1832,7 +2064,7 @@
"h": 10, "h": 10,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 54 "y": 60
}, },
"targets": [ "targets": [
{ {
@ -1913,7 +2145,7 @@
"h": 7, "h": 7,
"w": 8, "w": 8,
"x": 0, "x": 0,
"y": 25 "y": 31
}, },
"targets": [ "targets": [
{ {
@ -1957,7 +2189,7 @@
"h": 7, "h": 7,
"w": 8, "w": 8,
"x": 8, "x": 8,
"y": 25 "y": 31
}, },
"targets": [ "targets": [
{ {
@ -2001,7 +2233,7 @@
"h": 7, "h": 7,
"w": 8, "w": 8,
"x": 16, "x": 16,
"y": 25 "y": 31
}, },
"targets": [ "targets": [
{ {
@ -2045,7 +2277,7 @@
"h": 16, "h": 16,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 64 "y": 70
}, },
"targets": [ "targets": [
{ {
@ -2093,7 +2325,7 @@
"h": 16, "h": 16,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 64 "y": 70
}, },
"targets": [ "targets": [
{ {

View File

@ -804,7 +804,7 @@ data:
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 2, "h": 3,
"w": 5, "w": 5,
"x": 0, "x": 0,
"y": 8 "y": 8
@ -871,7 +871,7 @@ data:
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 2, "h": 3,
"w": 5, "w": 5,
"x": 10, "x": 10,
"y": 8 "y": 8
@ -976,7 +976,7 @@ data:
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 2, "h": 3,
"w": 5, "w": 5,
"x": 5, "x": 5,
"y": 8 "y": 8
@ -1052,7 +1052,7 @@ data:
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 2, "h": 3,
"w": 5, "w": 5,
"x": 15, "x": 15,
"y": 8 "y": 8
@ -1128,10 +1128,10 @@ data:
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 6, "h": 5,
"w": 6, "w": 6,
"x": 0, "x": 0,
"y": 10 "y": 11
}, },
"targets": [ "targets": [
{ {
@ -1203,10 +1203,10 @@ data:
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 6, "h": 5,
"w": 6, "w": 6,
"x": 6, "x": 6,
"y": 10 "y": 11
}, },
"targets": [ "targets": [
{ {
@ -1278,10 +1278,10 @@ data:
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 6, "h": 5,
"w": 6, "w": 6,
"x": 12, "x": 12,
"y": 10 "y": 11
}, },
"targets": [ "targets": [
{ {
@ -1345,10 +1345,10 @@ data:
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 6, "h": 5,
"w": 6, "w": 6,
"x": 18, "x": 18,
"y": 10 "y": 11
}, },
"targets": [ "targets": [
{ {
@ -1403,6 +1403,238 @@ data:
} }
] ]
}, },
{
"id": 40,
"type": "bargauge",
"title": "One-off Job Pods (age hours)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 4,
"x": 0,
"y": 16
},
"targets": [
{
"expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"} == 1)",
"refId": "A",
"legendFormat": "{{namespace}}/{{pod}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "h",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 6
},
{
"color": "orange",
"value": 24
},
{
"color": "red",
"value": 48
}
]
}
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 8
}
}
]
},
{
"id": 41,
"type": "timeseries",
"title": "Ariadne Attempts vs Failures (1h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 4,
"y": 16
},
"targets": [
{
"expr": "sum(increase(ariadne_task_runs_total[1h]))",
"refId": "A",
"legendFormat": "Attempts"
},
{
"expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
"refId": "B",
"legendFormat": "Failures"
}
],
"fieldConfig": {
"defaults": {
"unit": "none"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 42,
"type": "timeseries",
"title": "Ariadne Test Success Rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 12,
"y": 16
},
"targets": [
{
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 43,
"type": "bargauge",
"title": "Tests with Failures (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 4,
"x": 20,
"y": 16
},
"targets": [
{
"expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))",
"refId": "A",
"legendFormat": "{{result}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 5
},
{
"color": "red",
"value": 10
}
]
}
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
}
]
},
{ {
"id": 11, "id": 11,
"type": "piechart", "type": "piechart",
@ -1415,7 +1647,7 @@ data:
"h": 9, "h": 9,
"w": 8, "w": 8,
"x": 0, "x": 0,
"y": 16 "y": 22
}, },
"targets": [ "targets": [
{ {
@ -1484,7 +1716,7 @@ data:
"h": 9, "h": 9,
"w": 8, "w": 8,
"x": 8, "x": 8,
"y": 16 "y": 22
}, },
"targets": [ "targets": [
{ {
@ -1553,7 +1785,7 @@ data:
"h": 9, "h": 9,
"w": 8, "w": 8,
"x": 16, "x": 16,
"y": 16 "y": 22
}, },
"targets": [ "targets": [
{ {
@ -1622,7 +1854,7 @@ data:
"h": 12, "h": 12,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 32 "y": 38
}, },
"targets": [ "targets": [
{ {
@ -1669,7 +1901,7 @@ data:
"h": 12, "h": 12,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 32 "y": 38
}, },
"targets": [ "targets": [
{ {
@ -1716,7 +1948,7 @@ data:
"h": 10, "h": 10,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 44 "y": 50
}, },
"targets": [ "targets": [
{ {
@ -1753,7 +1985,7 @@ data:
"h": 10, "h": 10,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 44 "y": 50
}, },
"targets": [ "targets": [
{ {
@ -1790,7 +2022,7 @@ data:
"h": 10, "h": 10,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 54 "y": 60
}, },
"targets": [ "targets": [
{ {
@ -1841,7 +2073,7 @@ data:
"h": 10, "h": 10,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 54 "y": 60
}, },
"targets": [ "targets": [
{ {
@ -1922,7 +2154,7 @@ data:
"h": 7, "h": 7,
"w": 8, "w": 8,
"x": 0, "x": 0,
"y": 25 "y": 31
}, },
"targets": [ "targets": [
{ {
@ -1966,7 +2198,7 @@ data:
"h": 7, "h": 7,
"w": 8, "w": 8,
"x": 8, "x": 8,
"y": 25 "y": 31
}, },
"targets": [ "targets": [
{ {
@ -2010,7 +2242,7 @@ data:
"h": 7, "h": 7,
"w": 8, "w": 8,
"x": 16, "x": 16,
"y": 25 "y": 31
}, },
"targets": [ "targets": [
{ {
@ -2054,7 +2286,7 @@ data:
"h": 16, "h": 16,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 64 "y": 70
}, },
"targets": [ "targets": [
{ {
@ -2102,7 +2334,7 @@ data:
"h": 16, "h": 16,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 64 "y": 70
}, },
"targets": [ "targets": [
{ {

View File

@ -471,14 +471,14 @@ spec:
editable: true editable: true
options: options:
path: /var/lib/grafana/dashboards/mail path: /var/lib/grafana/dashboards/mail
- name: testing - name: jobs
orgId: 1 orgId: 1
folder: Atlas Internal folder: Atlas Internal
type: file type: file
disableDeletion: false disableDeletion: false
editable: true editable: true
options: options:
path: /var/lib/grafana/dashboards/testing path: /var/lib/grafana/dashboards/jobs
dashboardsConfigMaps: dashboardsConfigMaps:
overview: grafana-dashboard-overview overview: grafana-dashboard-overview
overview-public: grafana-dashboard-overview overview-public: grafana-dashboard-overview
@ -488,7 +488,7 @@ spec:
gpu: grafana-dashboard-gpu gpu: grafana-dashboard-gpu
network: grafana-dashboard-network network: grafana-dashboard-network
mail: grafana-dashboard-mail mail: grafana-dashboard-mail
testing: grafana-dashboard-testing jobs: grafana-dashboard-jobs
extraConfigmapMounts: extraConfigmapMounts:
- name: grafana-folders - name: grafana-folders
mountPath: /etc/grafana/provisioning/folders mountPath: /etc/grafana/provisioning/folders

View File

@ -14,7 +14,7 @@ resources:
- grafana-dashboard-network.yaml - grafana-dashboard-network.yaml
- grafana-dashboard-gpu.yaml - grafana-dashboard-gpu.yaml
- grafana-dashboard-mail.yaml - grafana-dashboard-mail.yaml
- grafana-dashboard-testing.yaml - grafana-dashboard-jobs.yaml
- dcgm-exporter.yaml - dcgm-exporter.yaml
- jetson-tegrastats-exporter.yaml - jetson-tegrastats-exporter.yaml
- postmark-exporter-service.yaml - postmark-exporter-service.yaml