# services/monitoring/grafana-dashboard-jobs.yaml
#
# ConfigMap carrying the "Atlas Jobs" Grafana dashboard (uid: atlas-jobs) as an
# embedded JSON document under the key atlas-jobs.json.
# NOTE(review): the grafana_dashboard: "1" label is presumably what a dashboard
# provisioning sidecar selects on - confirm against the Grafana deployment.
#
# Query notes (the JSON payload itself cannot hold comments):
#   * Panels 4, 5 and 6 aggregate over filtered series that are empty in the
#     healthy state. Aggregating an empty vector yields no series, and a binary
#     operator with an empty operand yields no series either, so each aggregate
#     ends in `or vector(0)` to render 0 instead of "No data".
#   * Panel 3 joins on kube_pod_status_phase filtered with `== 1`, so only the
#     phase a pod is currently in takes part in the group_left(phase) match
#     (one series per pod on the "one" side, and no multiplication by 0).
#   * Panels 5, 7 and 8 count failures, so their thresholds escalate
#     green -> yellow -> orange -> red like panels 4, 1 and 14.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-jobs
  labels:
    grafana_dashboard: "1"
data:
  atlas-jobs.json: |
    {
      "uid": "atlas-jobs",
      "title": "Atlas Jobs",
      "folderUid": "atlas-internal",
      "editable": true,
      "panels": [
        {
          "id": 1,
          "type": "bargauge",
          "title": "Ariadne Task Errors (24h)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 7, "w": 6, "x": 0, "y": 0 },
          "targets": [
            {
              "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
              "refId": "A",
              "legendFormat": "{{task}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "none",
              "min": 0,
              "max": null,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 1 },
                  { "color": "orange", "value": 3 },
                  { "color": "red", "value": 5 }
                ]
              }
            },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }
          },
          "transformations": [
            { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "desc" } }
          ]
        },
        {
          "id": 2,
          "type": "timeseries",
          "title": "Ariadne Attempts vs Failures (1h)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 7, "w": 12, "x": 6, "y": 0 },
          "targets": [
            {
              "expr": "sum(increase(ariadne_task_runs_total[1h]))",
              "refId": "A",
              "legendFormat": "Attempts"
            },
            {
              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
              "refId": "B",
              "legendFormat": "Failures"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "none" },
            "overrides": []
          },
          "options": {
            "legend": { "displayMode": "table", "placement": "right" },
            "tooltip": { "mode": "multi" }
          }
        },
        {
          "id": 3,
          "type": "bargauge",
          "title": "One-off Job Pods (age hours)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 7, "w": 6, "x": 18, "y": 0 },
          "targets": [
            {
              "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"} == 1)",
              "refId": "A",
              "legendFormat": "{{namespace}}/{{pod}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "h",
              "min": 0,
              "max": null,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 6 },
                  { "color": "orange", "value": 24 },
                  { "color": "red", "value": 48 }
                ]
              }
            },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }
          },
          "transformations": [
            { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "desc" } },
            { "id": "limit", "options": { "limit": 12 } }
          ]
        },
        {
          "id": 4,
          "type": "stat",
          "title": "Glue Jobs Stale (>36h)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 4, "w": 4, "x": 0, "y": 7 },
          "targets": [
            {
              "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or vector(0)) + (count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or vector(0))",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 1 },
                  { "color": "orange", "value": 2 },
                  { "color": "red", "value": 3 }
                ]
              },
              "unit": "none",
              "custom": { "displayMode": "auto" }
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
            "textMode": "value"
          }
        },
        {
          "id": 5,
          "type": "stat",
          "title": "Glue Jobs Missing Success",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 4, "w": 4, "x": 4, "y": 7 },
          "targets": [
            {
              "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or vector(0)",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 1 },
                  { "color": "orange", "value": 2 },
                  { "color": "red", "value": 3 }
                ]
              },
              "unit": "none",
              "custom": { "displayMode": "auto" }
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
            "textMode": "value"
          }
        },
        {
          "id": 6,
          "type": "stat",
          "title": "Glue Jobs Suspended",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 4, "w": 4, "x": 8, "y": 7 },
          "targets": [
            {
              "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or vector(0)",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "rgba(115, 115, 115, 1)", "value": null },
                  { "color": "green", "value": 1 }
                ]
              },
              "unit": "none",
              "custom": { "displayMode": "auto" }
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
            "textMode": "value"
          }
        },
        {
          "id": 7,
          "type": "stat",
          "title": "Ariadne Task Errors (1h)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 4, "w": 4, "x": 12, "y": 7 },
          "targets": [
            {
              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 1 },
                  { "color": "orange", "value": 3 },
                  { "color": "red", "value": 5 }
                ]
              },
              "unit": "none",
              "custom": { "displayMode": "auto" }
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
            "textMode": "value"
          }
        },
        {
          "id": 8,
          "type": "stat",
          "title": "Ariadne Task Errors (24h)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 4, "w": 4, "x": 16, "y": 7 },
          "targets": [
            {
              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 1 },
                  { "color": "orange", "value": 3 },
                  { "color": "red", "value": 5 }
                ]
              },
              "unit": "none",
              "custom": { "displayMode": "auto" }
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
            "textMode": "value"
          }
        },
        {
          "id": 9,
          "type": "stat",
          "title": "Ariadne Task Runs (1h)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 4, "w": 4, "x": 20, "y": 7 },
          "targets": [
            {
              "expr": "sum(increase(ariadne_task_runs_total[1h]))",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "rgba(115, 115, 115, 1)", "value": null },
                  { "color": "green", "value": 1 }
                ]
              },
              "unit": "none",
              "custom": { "displayMode": "auto" }
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
            "textMode": "value"
          }
        },
        {
          "id": 10,
          "type": "bargauge",
          "title": "Ariadne Schedule Last Error (hours ago)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 11 },
          "targets": [
            {
              "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600",
              "refId": "A",
              "legendFormat": "{{task}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "h",
              "min": 0,
              "max": null,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "orange", "value": 1 },
                  { "color": "yellow", "value": 6 },
                  { "color": "green", "value": 24 }
                ]
              }
            },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }
          },
          "transformations": [
            { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "desc" } }
          ]
        },
        {
          "id": 11,
          "type": "bargauge",
          "title": "Ariadne Schedule Last Success (hours ago)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 11 },
          "targets": [
            {
              "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600",
              "refId": "A",
              "legendFormat": "{{task}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "h",
              "min": 0,
              "max": null,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 6 },
                  { "color": "orange", "value": 24 },
                  { "color": "red", "value": 48 }
                ]
              }
            },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }
          },
          "transformations": [
            { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "desc" } }
          ]
        },
        {
          "id": 12,
          "type": "bargauge",
          "title": "Glue Jobs Last Success (hours ago)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 19 },
          "targets": [
            {
              "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
              "refId": "A",
              "legendFormat": "{{namespace}}/{{cronjob}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "h",
              "min": 0,
              "max": null,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 6 },
                  { "color": "orange", "value": 24 },
                  { "color": "red", "value": 48 }
                ]
              }
            },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }
          },
          "transformations": [
            { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "desc" } }
          ]
        },
        {
          "id": 13,
          "type": "bargauge",
          "title": "Glue Jobs Last Schedule (hours ago)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 19 },
          "targets": [
            {
              "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
              "refId": "A",
              "legendFormat": "{{namespace}}/{{cronjob}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "h",
              "min": 0,
              "max": null,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 6 },
                  { "color": "orange", "value": 24 },
                  { "color": "red", "value": 48 }
                ]
              }
            },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }
          },
          "transformations": [
            { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "desc" } }
          ]
        },
        {
          "id": 14,
          "type": "bargauge",
          "title": "Ariadne Task Errors (1h)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 27 },
          "targets": [
            {
              "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
              "refId": "A",
              "legendFormat": "{{task}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "none",
              "min": 0,
              "max": null,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 1 },
                  { "color": "orange", "value": 3 },
                  { "color": "red", "value": 5 }
                ]
              }
            },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }
          },
          "transformations": [
            { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "desc" } }
          ]
        },
        {
          "id": 15,
          "type": "bargauge",
          "title": "Ariadne Task Errors (30d)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 27 },
          "targets": [
            {
              "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))",
              "refId": "A",
              "legendFormat": "{{task}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "none",
              "min": 0,
              "max": null,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 1 },
                  { "color": "orange", "value": 3 },
                  { "color": "red", "value": 5 }
                ]
              }
            },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }
          },
          "transformations": [
            { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "desc" } }
          ]
        },
        {
          "id": 16,
          "type": "bargauge",
          "title": "Ariadne Access Requests",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 6, "w": 8, "x": 0, "y": 35 },
          "targets": [
            {
              "expr": "ariadne_access_requests_total",
              "refId": "A",
              "legendFormat": "{{status}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "none",
              "min": 0,
              "max": null,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 50 },
                  { "color": "orange", "value": 70 },
                  { "color": "red", "value": 85 }
                ]
              }
            },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }
          },
          "transformations": [
            { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "desc" } }
          ]
        },
        {
          "id": 17,
          "type": "stat",
          "title": "Ariadne CI Coverage (%)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 6, "w": 4, "x": 8, "y": 35 },
          "targets": [
            {
              "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
              "refId": "A",
              "legendFormat": "{{branch}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "rgba(115, 115, 115, 1)", "value": null },
                  { "color": "green", "value": 1 }
                ]
              },
              "unit": "percent",
              "custom": { "displayMode": "auto" },
              "decimals": 1
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
            "textMode": "value"
          }
        },
        {
          "id": 18,
          "type": "table",
          "title": "Ariadne CI Tests (latest)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 6, "w": 12, "x": 12, "y": 35 },
          "targets": [
            {
              "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
              "refId": "A",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "none",
              "custom": { "filterable": true }
            },
            "overrides": []
          },
          "options": { "showHeader": true, "columnFilters": false },
          "transformations": [
            { "id": "labelsToFields", "options": {} },
            { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "desc" } }
          ]
        }
      ],
      "time": { "from": "now-7d", "to": "now" },
      "annotations": { "list": [] },
      "schemaVersion": 39,
      "style": "dark",
      "tags": [ "atlas", "jobs", "glue" ]
    }