diff --git a/infrastructure/core/coredns-custom.yaml b/infrastructure/core/coredns-custom.yaml index ea28def..8aeff14 100644 --- a/infrastructure/core/coredns-custom.yaml +++ b/infrastructure/core/coredns-custom.yaml @@ -24,7 +24,7 @@ data: 192.168.22.9 live.bstein.dev 192.168.22.9 logs.bstein.dev 192.168.22.9 longhorn.bstein.dev - 192.168.22.9 mail.bstein.dev + 192.168.22.4 mail.bstein.dev 192.168.22.9 matrix.live.bstein.dev 192.168.22.9 metrics.bstein.dev 192.168.22.9 monero.bstein.dev diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 4aa2908..179536e 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -332,6 +332,8 @@ GLUE_MISSING = f"({GLUE_JOBS} unless {GLUE_LAST_SUCCESS})" GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})" GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})" GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" +GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" +GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -1054,17 +1056,6 @@ def build_overview(): links=link_to("atlas-mail"), ) ) - panels.append( - stat_panel( - 34, - "Glue Jobs Stale", - GLUE_STALE_COUNT, - {"h": 2, "w": 4, "x": 20, "y": 8}, - unit="none", - thresholds=count_thresholds, - links=link_to("atlas-glue"), - ) - ) storage_panels = [ (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), @@ -1120,6 +1111,50 @@ def build_overview(): ) ) + panels.append( + { + "id": 34, + "type": "row", + "title": "Glue + Automation", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 25}, + "collapsed": False, + "panels": [], + } + ) + panels.append( + stat_panel( + 35, + "Glue Jobs Stale", + GLUE_STALE_COUNT, + {"h": 6, "w": 8, "x": 0, "y": 26}, + unit="none", + thresholds=count_thresholds, + links=link_to("atlas-glue"), + ) + ) + panels.append( + stat_panel( + 36, + "Glue Jobs Missing Success", + GLUE_MISSING_COUNT, + {"h": 6, "w": 8, "x": 8, "y": 26}, + unit="none", + thresholds=count_thresholds, + links=link_to("atlas-glue"), + ) + ) + panels.append( + stat_panel( + 37, + "Glue Jobs Suspended", + GLUE_SUSPENDED_COUNT, + {"h": 6, "w": 8, "x": 16, "y": 26}, + unit="none", + thresholds=count_thresholds, + links=link_to("atlas-glue"), + ) + ) + worker_filter = f"{WORKER_REGEX}" panels.append( timeseries_panel( @@ -2186,7 +2221,7 @@ def build_glue_dashboard(): table_panel( 2, "Glue Jobs Missing Success", - GLUE_MISSING, + GLUE_MISSING_ACTIVE, {"h": 4, "w": 6, "x": 6, "y": 0}, unit="none", transformations=sort_desc, diff --git a/services/monitoring/dashboards/atlas-glue.json b/services/monitoring/dashboards/atlas-glue.json index c836d18..732d36c 100644 --- a/services/monitoring/dashboards/atlas-glue.json +++ b/services/monitoring/dashboards/atlas-glue.json @@ -88,7 +88,7 @@ }, "targets": [ { - "expr": "(kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})", + "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)", "refId": "A", "instant": true } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index e1c5d3a..8732391 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1110,81 +1110,6 @@ } ] }, - { - "id": 34, - "type": "stat", - "title": "Glue Jobs Stale", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 2, - "w": 4, - "x": 20, - "y": 8 - }, - "targets": [ - { - "expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-glue dashboard", - "url": "/d/atlas-glue", - "targetBlank": true - } - ] - }, { "id": 23, "type": "stat", @@ -1676,6 +1601,244 @@ ], "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." }, + { + "id": 34, + "type": "row", + "title": "Glue + Automation", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "collapsed": false, + "panels": [] + }, + { + "id": 35, + "type": "stat", + "title": "Glue Jobs Stale", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 26 + }, + "targets": [ + { + "expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-glue dashboard", + "url": "/d/atlas-glue", + "targetBlank": true + } + ] + }, + { + "id": 36, + "type": "stat", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 26 + }, + "targets": [ + { + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-glue dashboard", + "url": "/d/atlas-glue", + "targetBlank": true + } + ] + }, + { + "id": 37, + "type": "stat", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 26 + }, + "targets": [ + { + "expr": "sum(kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-glue dashboard", + "url": "/d/atlas-glue", + "targetBlank": true + } + ] + }, { "id": 14, "type": "timeseries", diff --git a/services/monitoring/grafana-dashboard-glue.yaml b/services/monitoring/grafana-dashboard-glue.yaml index 0f8c0a1..7aeec74 100644 --- a/services/monitoring/grafana-dashboard-glue.yaml +++ b/services/monitoring/grafana-dashboard-glue.yaml @@ -97,7 +97,7 @@ data: }, "targets": [ { - "expr": "(kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})", + "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)", "refId": "A", "instant": true } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 78d5566..0f6cd72 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1119,81 +1119,6 @@ data: } ] }, - { - "id": 34, - "type": "stat", - "title": "Glue Jobs Stale", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 2, - "w": 4, - "x": 20, - "y": 8 - }, - "targets": [ - { - "expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-glue dashboard", - "url": "/d/atlas-glue", - "targetBlank": true - } - ] - }, { "id": 23, "type": "stat", @@ -1685,6 +1610,244 @@ data: ], "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." }, + { + "id": 34, + "type": "row", + "title": "Glue + Automation", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "collapsed": false, + "panels": [] + }, + { + "id": 35, + "type": "stat", + "title": "Glue Jobs Stale", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 26 + }, + "targets": [ + { + "expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-glue dashboard", + "url": "/d/atlas-glue", + "targetBlank": true + } + ] + }, + { + "id": 36, + "type": "stat", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 26 + }, + "targets": [ + { + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-glue dashboard", + "url": "/d/atlas-glue", + "targetBlank": true + } + ] + }, + { + "id": 37, + "type": "stat", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 26 + }, + "targets": [ + { + "expr": "sum(kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-glue dashboard", + "url": "/d/atlas-glue", + "targetBlank": true + } + ] + }, { "id": 14, "type": "timeseries",