From 8b35ab02922bb80b5386dfcfa76f3c7b8fdc3d32 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 13:37:36 -0300 Subject: [PATCH] monitoring: refresh jobs dashboards --- scripts/dashboards_render_atlas.py | 330 ++++-- services/maintenance/ariadne-deployment.yaml | 8 +- .../{atlas-testing.json => atlas-jobs.json} | 1034 +++++++++------- .../monitoring/dashboards/atlas-overview.json | 284 ++++- ...sting.yaml => grafana-dashboard-jobs.yaml} | 1040 ++++++++++------- .../grafana-dashboard-overview.yaml | 284 ++++- services/monitoring/helmrelease.yaml | 6 +- services/monitoring/kustomization.yaml | 2 +- 8 files changed, 1946 insertions(+), 1042 deletions(-) rename services/monitoring/dashboards/{atlas-testing.json => atlas-jobs.json} (84%) rename services/monitoring/{grafana-dashboard-testing.yaml => grafana-dashboard-jobs.yaml} (84%) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 6eaafb4..1235a0a 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -337,16 +337,39 @@ GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' +ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))' +ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))' ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))' ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))' +ARIADNE_TASK_ATTEMPTS_1H = 'sum(increase(ariadne_task_runs_total[1h]))' +ARIADNE_TASK_FAILURES_1H = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' +ARIADNE_TEST_SUCCESS_RATE = ( + "100 * " + 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[1h])) ' + "/ clamp_min(" + 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[1h])), 1)' +) +ARIADNE_TEST_FAILURES_24H = ( + 'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' +) +ONEOFF_JOB_OWNER = ( + 'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")' +) +ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})' +ONEOFF_JOB_POD_AGE_HOURS = ( + '((time() - kube_pod_start_time{pod!=""}) / 3600) ' + f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} ' + '* on(namespace,pod) group_left(phase) ' + 'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})' +) GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -798,6 +821,15 @@ def build_overview(): {"color": "red", "value": 3}, ], } + age_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 6}, + {"color": "orange", "value": 24}, + {"color": "red", "value": 48}, + ], + } row1_stats = [ { @@ -1000,7 +1032,7 @@ def build_overview(): 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', - {"h": 2, "w": 5, "x": 0, "y": 8}, + {"h": 3, "w": 5, "x": 0, "y": 8}, unit="none", links=link_to("atlas-mail"), ) @@ -1011,7 +1043,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 2, "w": 5, "x": 10, "y": 8}, + "gridPos": {"h": 3, "w": 5, "x": 10, "y": 8}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1057,7 +1089,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 2, "w": 5, "x": 5, "y": 8}, + {"h": 3, "w": 5, "x": 5, "y": 8}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1069,7 +1101,7 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 2, "w": 5, "x": 15, "y": 8}, + {"h": 3, "w": 5, "x": 15, "y": 8}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, @@ -1089,13 +1121,76 @@ def build_overview(): panel_id, title, expr, - {"h": 6, "w": 6, "x": 6 * idx, "y": 10}, + {"h": 5, "w": 6, "x": 6 * idx, "y": 11}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, links=link_to("atlas-storage"), ) ) + panels.append( + bargauge_panel( + 40, + "One-off Job Pods (age hours)", + ONEOFF_JOB_POD_AGE_HOURS, + {"h": 6, "w": 4, "x": 0, "y": 16}, + unit="h", + instant=True, + legend="{{namespace}}/{{pod}}", + thresholds=age_thresholds, + limit=8, + ) + ) + panels.append( + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", + "datasource": PROM_DS, + "gridPos": {"h": 6, "w": 8, "x": 4, "y": 16}, + "targets": [ + {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"}, + ], + "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "options": { + "legend": {"displayMode": "table", "placement": "right"}, + "tooltip": {"mode": "multi"}, + }, + } + ) + panels.append( + timeseries_panel( + 42, + "Ariadne Test Success Rate", + ARIADNE_TEST_SUCCESS_RATE, + {"h": 6, "w": 8, "x": 12, "y": 16}, + unit="percent", + legend=None, + legend_display="list", + ) + ) + panels.append( + bargauge_panel( + 43, + "Tests with Failures (24h)", + ARIADNE_TEST_FAILURES_24H, + {"h": 6, "w": 4, "x": 20, "y": 16}, + unit="none", + instant=True, + legend="{{result}}", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 5}, + {"color": "red", "value": 10}, + ], + }, + ) + ) + cpu_scope = "$namespace_scope_cpu" gpu_scope = "$namespace_scope_gpu" ram_scope = "$namespace_scope_ram" @@ -1105,7 +1200,7 @@ def build_overview(): 11, "Namespace CPU Share", namespace_cpu_share_expr(cpu_scope), - {"h": 9, "w": 8, "x": 0, "y": 16}, + {"h": 9, "w": 8, "x": 0, "y": 22}, links=namespace_scope_links("namespace_scope_cpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1115,7 +1210,7 @@ def build_overview(): 12, "Namespace GPU Share", namespace_gpu_share_expr(gpu_scope), - {"h": 9, "w": 8, "x": 8, "y": 16}, + {"h": 9, "w": 8, "x": 8, "y": 22}, links=namespace_scope_links("namespace_scope_gpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1125,7 +1220,7 @@ def build_overview(): 13, "Namespace RAM Share", namespace_ram_share_expr(ram_scope), - {"h": 9, "w": 8, "x": 16, "y": 16}, + {"h": 9, "w": 8, "x": 16, "y": 22}, links=namespace_scope_links("namespace_scope_ram"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1137,7 +1232,7 @@ def build_overview(): 14, "Worker Node CPU", node_cpu_expr(worker_filter), - {"h": 12, "w": 12, "x": 0, "y": 32}, + {"h": 12, "w": 12, "x": 0, "y": 38}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1151,7 +1246,7 @@ def build_overview(): 15, "Worker Node RAM", node_mem_expr(worker_filter), - {"h": 12, "w": 12, "x": 12, "y": 32}, + {"h": 12, "w": 12, "x": 12, "y": 38}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1166,7 +1261,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 0, "y": 44}, + {"h": 10, "w": 12, "x": 0, "y": 50}, unit="percent", legend="{{node}}", legend_display="table", @@ -1178,7 +1273,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 12, "y": 44}, + {"h": 10, "w": 12, "x": 12, "y": 50}, unit="percent", legend="{{node}}", legend_display="table", @@ -1191,7 +1286,7 @@ def build_overview(): 28, "Node Pod Share", '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', - {"h": 10, "w": 12, "x": 0, "y": 54}, + {"h": 10, "w": 12, "x": 0, "y": 60}, ) ) panels.append( @@ -1199,7 +1294,7 @@ def build_overview(): 29, "Top Nodes by Pod Count", 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', - {"h": 10, "w": 12, "x": 12, "y": 54}, + {"h": 10, "w": 12, "x": 12, "y": 60}, unit="none", limit=12, decimals=0, @@ -1221,7 +1316,7 @@ def build_overview(): 18, "Cluster Ingress Throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 8, "x": 0, "y": 25}, + {"h": 7, "w": 8, "x": 0, "y": 31}, unit="Bps", legend="Ingress (Traefik)", legend_display="list", @@ -1234,7 +1329,7 @@ def build_overview(): 19, "Cluster Egress Throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 8, "x": 8, "y": 25}, + {"h": 7, "w": 8, "x": 8, "y": 31}, unit="Bps", legend="Egress (Traefik)", legend_display="list", @@ -1247,7 +1342,7 @@ def build_overview(): 20, "Intra-Cluster Throughput", NET_INTERNAL_EXPR, - {"h": 7, "w": 8, "x": 16, "y": 25}, + {"h": 7, "w": 8, "x": 16, "y": 31}, unit="Bps", legend="Internal traffic", legend_display="list", @@ -1261,7 +1356,7 @@ def build_overview(): 21, "Root Filesystem Usage", root_usage_expr(), - {"h": 16, "w": 12, "x": 0, "y": 64}, + {"h": 16, "w": 12, "x": 0, "y": 70}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1276,7 +1371,7 @@ def build_overview(): 22, "Nodes Closest to Full Root Disks", f"topk(12, {root_usage_expr()})", - {"h": 16, "w": 12, "x": 12, "y": 64}, + {"h": 16, "w": 12, "x": 12, "y": 70}, unit="percent", thresholds=PERCENT_THRESHOLDS, links=link_to("atlas-storage"), @@ -2171,7 +2266,7 @@ def build_mail_dashboard(): } -def build_testing_dashboard(): +def build_jobs_dashboard(): panels = [] age_thresholds = { "mode": "absolute", @@ -2192,12 +2287,65 @@ def build_testing_dashboard(): ], } + task_error_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 3}, + {"color": "red", "value": 5}, + ], + } + + panels.append( + bargauge_panel( + 1, + "Ariadne Task Errors (24h)", + ARIADNE_TASK_ERRORS_24H, + {"h": 7, "w": 6, "x": 0, "y": 0}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + { + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", + "datasource": PROM_DS, + "gridPos": {"h": 7, "w": 12, "x": 6, "y": 0}, + "targets": [ + {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"}, + ], + "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "options": { + "legend": {"displayMode": "table", "placement": "right"}, + "tooltip": {"mode": "multi"}, + }, + } + ) + panels.append( + bargauge_panel( + 3, + "One-off Job Pods (age hours)", + ONEOFF_JOB_POD_AGE_HOURS, + {"h": 7, "w": 6, "x": 18, "y": 0}, + unit="h", + instant=True, + legend="{{namespace}}/{{pod}}", + thresholds=age_thresholds, + limit=12, + ) + ) panels.append( stat_panel( - 1, + 4, "Glue Jobs Stale (>36h)", GLUE_STALE_COUNT, - {"h": 4, "w": 6, "x": 0, "y": 0}, + {"h": 4, "w": 4, "x": 0, "y": 7}, unit="none", thresholds={ "mode": "absolute", @@ -2212,99 +2360,47 @@ def build_testing_dashboard(): ) panels.append( stat_panel( - 2, + 5, "Glue Jobs Missing Success", GLUE_MISSING_COUNT, - {"h": 4, "w": 4, "x": 4, "y": 0}, - unit="none", - ) - ) - panels.append( - stat_panel( - 3, - "Glue Jobs Suspended", - GLUE_SUSPENDED_COUNT, - {"h": 4, "w": 4, "x": 8, "y": 0}, - unit="none", - ) - ) - panels.append( - stat_panel( - 4, - "Ariadne Task Errors (1h)", - ARIADNE_TASK_ERRORS_1H_TOTAL, - {"h": 4, "w": 4, "x": 12, "y": 0}, - unit="none", - ) - ) - panels.append( - stat_panel( - 5, - "Ariadne Task Errors (24h)", - ARIADNE_TASK_ERRORS_24H_TOTAL, - {"h": 4, "w": 4, "x": 16, "y": 0}, + {"h": 4, "w": 4, "x": 4, "y": 7}, unit="none", ) ) panels.append( stat_panel( 6, - "Ariadne Task Runs (1h)", - ARIADNE_TASK_RUNS_1H_TOTAL, - {"h": 4, "w": 4, "x": 20, "y": 0}, + "Glue Jobs Suspended", + GLUE_SUSPENDED_COUNT, + {"h": 4, "w": 4, "x": 8, "y": 7}, unit="none", ) ) panels.append( - timeseries_panel( + stat_panel( 7, - "Ariadne Task Runs vs Errors (1h)", - ARIADNE_TASK_RUNS_BY_STATUS_1H, - {"h": 6, "w": 24, "x": 0, "y": 4}, + "Ariadne Task Errors (1h)", + ARIADNE_TASK_ERRORS_1H_TOTAL, + {"h": 4, "w": 4, "x": 12, "y": 7}, unit="none", - legend="{{status}}", - legend_display="table", - legend_placement="right", ) ) panels.append( - bargauge_panel( + stat_panel( 8, "Ariadne Task Errors (24h)", - ARIADNE_TASK_ERRORS_24H, - {"h": 8, "w": 12, "x": 0, "y": 10}, + ARIADNE_TASK_ERRORS_24H_TOTAL, + {"h": 4, "w": 4, "x": 16, "y": 7}, unit="none", - instant=True, - legend="{{task}}", - thresholds={ - "mode": "absolute", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 1}, - {"color": "orange", "value": 3}, - {"color": "red", "value": 5}, - ], - }, ) ) panels.append( - bargauge_panel( + stat_panel( 9, - "Ariadne Task Success (24h)", - ARIADNE_TASK_SUCCESS_24H, - {"h": 8, "w": 12, "x": 12, "y": 10}, + "Ariadne Task Runs (1h)", + ARIADNE_TASK_RUNS_1H_TOTAL, + {"h": 4, "w": 4, "x": 20, "y": 7}, unit="none", - instant=True, - legend="{{task}}", - thresholds={ - "mode": "absolute", - "steps": [ - {"color": "red", "value": None}, - {"color": "orange", "value": 1}, - {"color": "yellow", "value": 5}, - {"color": "green", "value": 10}, - ], - }, ) ) panels.append( @@ -2312,7 +2408,7 @@ def build_testing_dashboard(): 10, "Ariadne Schedule Last Error (hours ago)", ARIADNE_SCHEDULE_LAST_ERROR_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 18}, + {"h": 8, "w": 12, "x": 0, "y": 11}, unit="h", instant=True, legend="{{task}}", @@ -2324,7 +2420,7 @@ def build_testing_dashboard(): 11, "Ariadne Schedule Last Success (hours ago)", ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 18}, + {"h": 8, "w": 12, "x": 12, "y": 11}, unit="h", instant=True, legend="{{task}}", @@ -2336,7 +2432,7 @@ def build_testing_dashboard(): 12, "Glue Jobs Last Success (hours ago)", GLUE_LAST_SUCCESS_AGE_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 26}, + {"h": 8, "w": 12, "x": 0, "y": 19}, unit="h", instant=True, legend="{{namespace}}/{{cronjob}}", @@ -2348,7 +2444,7 @@ def build_testing_dashboard(): 13, "Glue Jobs Last Schedule (hours ago)", GLUE_LAST_SCHEDULE_AGE_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 26}, + {"h": 8, "w": 12, "x": 12, "y": 19}, unit="h", instant=True, legend="{{namespace}}/{{cronjob}}", @@ -2358,9 +2454,33 @@ def build_testing_dashboard(): panels.append( bargauge_panel( 14, + "Ariadne Task Errors (1h)", + ARIADNE_TASK_ERRORS_1H, + {"h": 8, "w": 12, "x": 0, "y": 27}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + bargauge_panel( + 15, + "Ariadne Task Errors (30d)", + ARIADNE_TASK_ERRORS_30D, + {"h": 8, "w": 12, "x": 12, "y": 27}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + bargauge_panel( + 16, "Ariadne Access Requests", ARIADNE_ACCESS_REQUESTS, - {"h": 6, "w": 8, "x": 0, "y": 34}, + {"h": 6, "w": 8, "x": 0, "y": 35}, unit="none", instant=True, legend="{{status}}", @@ -2368,10 +2488,10 @@ def build_testing_dashboard(): ) panels.append( stat_panel( - 15, + 17, "Ariadne CI Coverage (%)", ARIADNE_CI_COVERAGE, - {"h": 6, "w": 4, "x": 8, "y": 34}, + {"h": 6, "w": 4, "x": 8, "y": 35}, unit="percent", decimals=1, instant=True, @@ -2380,10 +2500,10 @@ def build_testing_dashboard(): ) panels.append( table_panel( - 16, + 18, "Ariadne CI Tests (latest)", ARIADNE_CI_TESTS, - {"h": 6, "w": 12, "x": 12, "y": 34}, + {"h": 6, "w": 12, "x": 12, "y": 35}, unit="none", transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], instant=True, @@ -2391,8 +2511,8 @@ def build_testing_dashboard(): ) return { - "uid": "atlas-testing", - "title": "Atlas Testing", + "uid": "atlas-jobs", + "title": "Atlas Jobs", "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, @@ -2400,7 +2520,7 @@ def build_testing_dashboard(): "annotations": {"list": []}, "schemaVersion": 39, "style": "dark", - "tags": ["atlas", "testing"], + "tags": ["atlas", "jobs", "glue"], } @@ -2497,9 +2617,9 @@ DASHBOARDS = { "builder": build_mail_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml", }, - "atlas-testing": { - "builder": build_testing_dashboard, - "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-testing.yaml", + "atlas-jobs": { + "builder": build_jobs_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml", }, "atlas-gpu": { "builder": build_gpu_dashboard, diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 069f388..01e940c 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -270,7 +270,7 @@ spec: - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE value: "30 4 * * *" - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC - value: "*/15 * * * *" + value: "0 * * * *" - name: ARIADNE_SCHEDULE_WGER_USER_SYNC value: "0 5 * * *" - name: ARIADNE_SCHEDULE_WGER_ADMIN @@ -286,11 +286,11 @@ spec: - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER value: "30 4 * * 0" - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH - value: "*/15 * * * *" + value: "0 * * * *" - name: ARIADNE_SCHEDULE_VAULT_OIDC - value: "*/15 * * * *" + value: "0 * * * *" - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME - value: "*/1 * * * *" + value: "*/5 * * * *" - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE value: "*/30 * * * *" - name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-jobs.json similarity index 84% rename from services/monitoring/dashboards/atlas-testing.json rename to services/monitoring/dashboards/atlas-jobs.json index 420abf2..76e21f0 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -1,416 +1,11 @@ { - "uid": "atlas-testing", - "title": "Atlas Testing", + "uid": "atlas-jobs", + "title": "Atlas Jobs", "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, - "type": "stat", - "title": "Glue Jobs Stale (>36h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 2, - "type": "stat", - "title": "Glue Jobs Missing Success", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 0 - }, - "targets": [ - { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 3, - "type": "stat", - "title": "Glue Jobs Suspended", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 8, - "y": 0 - }, - "targets": [ - { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 4, - "type": "stat", - "title": "Ariadne Task Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 12, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 5, - "type": "stat", - "title": "Ariadne Task Errors (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 16, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 6, - "type": "stat", - "title": "Ariadne Task Runs (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 20, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 7, - "type": "timeseries", - "title": "Ariadne Task Runs vs Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 24, - "x": 0, - "y": 4 - }, - "targets": [ - { - "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))", - "refId": "A", - "legendFormat": "{{status}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 8, "type": "bargauge", "title": "Ariadne Task Errors (24h)", "datasource": { @@ -418,10 +13,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, + "h": 7, + "w": 6, "x": 0, - "y": 10 + "y": 0 }, "targets": [ { @@ -484,50 +79,92 @@ ] }, { - "id": 9, - "type": "bargauge", - "title": "Ariadne Task Success (24h)", + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 10 + "x": 6, + "y": 0 }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))", + "expr": "sum(increase(ariadne_task_runs_total[1h]))", "refId": "A", - "legendFormat": "{{task}}", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 3, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", "instant": true } ], "fieldConfig": { "defaults": { - "unit": "none", + "unit": "h", "min": 0, "max": null, "thresholds": { "mode": "absolute", "steps": [ { - "color": "red", + "color": "green", "value": null }, - { - "color": "orange", - "value": 1 - }, { "color": "yellow", - "value": 5 + "value": 6 }, { - "color": "green", - "value": 10 + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 } ] } @@ -554,9 +191,383 @@ ], "order": "desc" } + }, + { + "id": "limit", + "options": { + "limit": 12 + } } ] }, + { + "id": 4, + "type": "stat", + "title": "Glue Jobs Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 7 + }, + "targets": [ + { + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 7 + }, + "targets": [ + { + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 7 + }, + "targets": [ + { + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 8, + "type": "stat", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 9, + "type": "stat", + "title": "Ariadne Task Runs (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, { "id": 10, "type": "bargauge", @@ -569,7 +580,7 @@ "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 11 }, "targets": [ { @@ -643,7 +654,7 @@ "h": 8, "w": 12, "x": 12, - "y": 18 + "y": 11 }, "targets": [ { @@ -717,7 +728,7 @@ "h": 8, "w": 12, "x": 0, - "y": 26 + "y": 19 }, "targets": [ { @@ -791,7 +802,7 @@ "h": 8, "w": 12, "x": 12, - "y": 26 + "y": 19 }, "targets": [ { @@ -856,6 +867,154 @@ { "id": 14, "type": "bargauge", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, + "type": "bargauge", + "title": "Ariadne Task Errors (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 16, + "type": "bargauge", "title": "Ariadne Access Requests", "datasource": { "type": "prometheus", @@ -865,7 +1024,7 @@ "h": 6, "w": 8, "x": 0, - "y": 34 + "y": 35 }, "targets": [ { @@ -928,7 +1087,7 @@ ] }, { - "id": 15, + "id": 17, "type": "stat", "title": "Ariadne CI Coverage (%)", "datasource": { @@ -939,7 +1098,7 @@ "h": 6, "w": 4, "x": 8, - "y": 34 + "y": 35 }, "targets": [ { @@ -991,7 +1150,7 @@ } }, { - "id": 16, + "id": 18, "type": "table", "title": "Ariadne CI Tests (latest)", "datasource": { @@ -1002,7 +1161,7 @@ "h": 6, "w": 12, "x": 12, - "y": 34 + "y": 35 }, "targets": [ { @@ -1052,6 +1211,7 @@ "style": "dark", "tags": [ "atlas", - "testing" + "jobs", + "glue" ] } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index c5f30d1..c3ff327 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -795,7 +795,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 0, "y": 8 @@ -862,7 +862,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 10, "y": 8 @@ -967,7 +967,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 5, "y": 8 @@ -1043,7 +1043,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 15, "y": 8 @@ -1119,10 +1119,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 0, - "y": 10 + "y": 11 }, "targets": [ { @@ -1194,10 +1194,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 6, - "y": 10 + "y": 11 }, "targets": [ { @@ -1269,10 +1269,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 12, - "y": 10 + "y": 11 }, "targets": [ { @@ -1336,10 +1336,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 18, - "y": 10 + "y": 11 }, "targets": [ { @@ -1394,6 +1394,238 @@ } ] }, + { + "id": 40, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 16 + }, + "targets": [ + { + "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 8 + } + } + ] + }, + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 4, + "y": 16 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 42, + "type": "timeseries", + "title": "Ariadne Test Success Rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 12, + "y": 16 + }, + "targets": [ + { + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 43, + "type": "bargauge", + "title": "Tests with Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 20, + "y": 16 + }, + "targets": [ + { + "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))", + "refId": "A", + "legendFormat": "{{result}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, { "id": 11, "type": "piechart", @@ -1406,7 +1638,7 @@ "h": 9, "w": 8, "x": 0, - "y": 16 + "y": 22 }, "targets": [ { @@ -1475,7 +1707,7 @@ "h": 9, "w": 8, "x": 8, - "y": 16 + "y": 22 }, "targets": [ { @@ -1544,7 +1776,7 @@ "h": 9, "w": 8, "x": 16, - "y": 16 + "y": 22 }, "targets": [ { @@ -1613,7 +1845,7 @@ "h": 12, "w": 12, "x": 0, - "y": 32 + "y": 38 }, "targets": [ { @@ -1660,7 +1892,7 @@ "h": 12, "w": 12, "x": 12, - "y": 32 + "y": 38 }, "targets": [ { @@ -1707,7 +1939,7 @@ "h": 10, "w": 12, "x": 0, - "y": 44 + "y": 50 }, "targets": [ { @@ -1744,7 +1976,7 @@ "h": 10, "w": 12, "x": 12, - "y": 44 + "y": 50 }, "targets": [ { @@ -1781,7 +2013,7 @@ "h": 10, "w": 12, "x": 0, - "y": 54 + "y": 60 }, "targets": [ { @@ -1832,7 +2064,7 @@ "h": 10, "w": 12, "x": 12, - "y": 54 + "y": 60 }, "targets": [ { @@ -1913,7 +2145,7 @@ "h": 7, "w": 8, "x": 0, - "y": 25 + "y": 31 }, "targets": [ { @@ -1957,7 +2189,7 @@ "h": 7, "w": 8, "x": 8, - "y": 25 + "y": 31 }, "targets": [ { @@ -2001,7 +2233,7 @@ "h": 7, "w": 8, "x": 16, - "y": 25 + "y": 31 }, "targets": [ { @@ -2045,7 +2277,7 @@ "h": 16, "w": 12, "x": 0, - "y": 64 + "y": 70 }, "targets": [ { @@ -2093,7 +2325,7 @@ "h": 16, "w": 12, "x": 12, - "y": 64 + "y": 70 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-jobs.yaml similarity index 84% rename from services/monitoring/grafana-dashboard-testing.yaml rename to services/monitoring/grafana-dashboard-jobs.yaml index 52b2836..19e0d4e 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -1,425 +1,20 @@ -# services/monitoring/grafana-dashboard-testing.yaml +# services/monitoring/grafana-dashboard-jobs.yaml apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboard-testing + name: grafana-dashboard-jobs labels: grafana_dashboard: "1" data: - atlas-testing.json: | + atlas-jobs.json: | { - "uid": "atlas-testing", - "title": "Atlas Testing", + "uid": "atlas-jobs", + "title": "Atlas Jobs", "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, - "type": "stat", - "title": "Glue Jobs Stale (>36h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 2, - "type": "stat", - "title": "Glue Jobs Missing Success", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 0 - }, - "targets": [ - { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 3, - "type": "stat", - "title": "Glue Jobs Suspended", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 8, - "y": 0 - }, - "targets": [ - { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 4, - "type": "stat", - "title": "Ariadne Task Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 12, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 5, - "type": "stat", - "title": "Ariadne Task Errors (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 16, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 6, - "type": "stat", - "title": "Ariadne Task Runs (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 20, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 7, - "type": "timeseries", - "title": "Ariadne Task Runs vs Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 24, - "x": 0, - "y": 4 - }, - "targets": [ - { - "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))", - "refId": "A", - "legendFormat": "{{status}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 8, "type": "bargauge", "title": "Ariadne Task Errors (24h)", "datasource": { @@ -427,10 +22,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, + "h": 7, + "w": 6, "x": 0, - "y": 10 + "y": 0 }, "targets": [ { @@ -493,50 +88,92 @@ data: ] }, { - "id": 9, - "type": "bargauge", - "title": "Ariadne Task Success (24h)", + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 10 + "x": 6, + "y": 0 }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))", + "expr": "sum(increase(ariadne_task_runs_total[1h]))", "refId": "A", - "legendFormat": "{{task}}", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 3, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", "instant": true } ], "fieldConfig": { "defaults": { - "unit": "none", + "unit": "h", "min": 0, "max": null, "thresholds": { "mode": "absolute", "steps": [ { - "color": "red", + "color": "green", "value": null }, - { - "color": "orange", - "value": 1 - }, { "color": "yellow", - "value": 5 + "value": 6 }, { - "color": "green", - "value": 10 + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 } ] } @@ -563,9 +200,383 @@ data: ], "order": "desc" } + }, + { + "id": "limit", + "options": { + "limit": 12 + } } ] }, + { + "id": 4, + "type": "stat", + "title": "Glue Jobs Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 7 + }, + "targets": [ + { + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 7 + }, + "targets": [ + { + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 7 + }, + "targets": [ + { + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 8, + "type": "stat", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 9, + "type": "stat", + "title": "Ariadne Task Runs (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, { "id": 10, "type": "bargauge", @@ -578,7 +589,7 @@ data: "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 11 }, "targets": [ { @@ -652,7 +663,7 @@ data: "h": 8, "w": 12, "x": 12, - "y": 18 + "y": 11 }, "targets": [ { @@ -726,7 +737,7 @@ data: "h": 8, "w": 12, "x": 0, - "y": 26 + "y": 19 }, "targets": [ { @@ -800,7 +811,7 @@ data: "h": 8, "w": 12, "x": 12, - "y": 26 + "y": 19 }, "targets": [ { @@ -865,6 +876,154 @@ data: { "id": 14, "type": "bargauge", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, + "type": "bargauge", + "title": "Ariadne Task Errors (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 16, + "type": "bargauge", "title": "Ariadne Access Requests", "datasource": { "type": "prometheus", @@ -874,7 +1033,7 @@ data: "h": 6, "w": 8, "x": 0, - "y": 34 + "y": 35 }, "targets": [ { @@ -937,7 +1096,7 @@ data: ] }, { - "id": 15, + "id": 17, "type": "stat", "title": "Ariadne CI Coverage (%)", "datasource": { @@ -948,7 +1107,7 @@ data: "h": 6, "w": 4, "x": 8, - "y": 34 + "y": 35 }, "targets": [ { @@ -1000,7 +1159,7 @@ data: } }, { - "id": 16, + "id": 18, "type": "table", "title": "Ariadne CI Tests (latest)", "datasource": { @@ -1011,7 +1170,7 @@ data: "h": 6, "w": 12, "x": 12, - "y": 34 + "y": 35 }, "targets": [ { @@ -1061,6 +1220,7 @@ data: "style": "dark", "tags": [ "atlas", - "testing" + "jobs", + "glue" ] } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 8ad7523..45969cc 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -804,7 +804,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 0, "y": 8 @@ -871,7 +871,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 10, "y": 8 @@ -976,7 +976,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 5, "y": 8 @@ -1052,7 +1052,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 15, "y": 8 @@ -1128,10 +1128,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 0, - "y": 10 + "y": 11 }, "targets": [ { @@ -1203,10 +1203,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 6, - "y": 10 + "y": 11 }, "targets": [ { @@ -1278,10 +1278,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 12, - "y": 10 + "y": 11 }, "targets": [ { @@ -1345,10 +1345,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 18, - "y": 10 + "y": 11 }, "targets": [ { @@ -1403,6 +1403,238 @@ data: } ] }, + { + "id": 40, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 16 + }, + "targets": [ + { + "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 8 + } + } + ] + }, + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 4, + "y": 16 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 42, + "type": "timeseries", + "title": "Ariadne Test Success Rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 12, + "y": 16 + }, + "targets": [ + { + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 43, + "type": "bargauge", + "title": "Tests with Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 20, + "y": 16 + }, + "targets": [ + { + "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))", + "refId": "A", + "legendFormat": "{{result}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, { "id": 11, "type": "piechart", @@ -1415,7 +1647,7 @@ data: "h": 9, "w": 8, "x": 0, - "y": 16 + "y": 22 }, "targets": [ { @@ -1484,7 +1716,7 @@ data: "h": 9, "w": 8, "x": 8, - "y": 16 + "y": 22 }, "targets": [ { @@ -1553,7 +1785,7 @@ data: "h": 9, "w": 8, "x": 16, - "y": 16 + "y": 22 }, "targets": [ { @@ -1622,7 +1854,7 @@ data: "h": 12, "w": 12, "x": 0, - "y": 32 + "y": 38 }, "targets": [ { @@ -1669,7 +1901,7 @@ data: "h": 12, "w": 12, "x": 12, - "y": 32 + "y": 38 }, "targets": [ { @@ -1716,7 +1948,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 44 + "y": 50 }, "targets": [ { @@ -1753,7 +1985,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 44 + "y": 50 }, "targets": [ { @@ -1790,7 +2022,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 54 + "y": 60 }, "targets": [ { @@ -1841,7 +2073,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 54 + "y": 60 }, "targets": [ { @@ -1922,7 +2154,7 @@ data: "h": 7, "w": 8, "x": 0, - "y": 25 + "y": 31 }, "targets": [ { @@ -1966,7 +2198,7 @@ data: "h": 7, "w": 8, "x": 8, - "y": 25 + "y": 31 }, "targets": [ { @@ -2010,7 +2242,7 @@ data: "h": 7, "w": 8, "x": 16, - "y": 25 + "y": 31 }, "targets": [ { @@ -2054,7 +2286,7 @@ data: "h": 16, "w": 12, "x": 0, - "y": 64 + "y": 70 }, "targets": [ { @@ -2102,7 +2334,7 @@ data: "h": 16, "w": 12, "x": 12, - "y": 64 + "y": 70 }, "targets": [ { diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 02bc482..ac24f8a 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -471,14 +471,14 @@ spec: editable: true options: path: /var/lib/grafana/dashboards/mail - - name: testing + - name: jobs orgId: 1 folder: Atlas Internal type: file disableDeletion: false editable: true options: - path: /var/lib/grafana/dashboards/testing + path: /var/lib/grafana/dashboards/jobs dashboardsConfigMaps: overview: grafana-dashboard-overview overview-public: grafana-dashboard-overview @@ -488,7 +488,7 @@ spec: gpu: grafana-dashboard-gpu network: grafana-dashboard-network mail: grafana-dashboard-mail - testing: grafana-dashboard-testing + jobs: grafana-dashboard-jobs extraConfigmapMounts: - name: grafana-folders mountPath: /etc/grafana/provisioning/folders diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 86ab826..5953039 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -14,7 +14,7 @@ resources: - grafana-dashboard-network.yaml - grafana-dashboard-gpu.yaml - grafana-dashboard-mail.yaml - - grafana-dashboard-testing.yaml + - grafana-dashboard-jobs.yaml - dcgm-exporter.yaml - jetson-tegrastats-exporter.yaml - postmark-exporter-service.yaml