From 343d41ecc74d64415859721a6981efeb0ee57ce2 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sun, 18 Jan 2026 02:50:07 -0300 Subject: [PATCH] monitoring: add glue dashboard and tag cronjobs --- scripts/dashboards_render_atlas.py | 150 +++++++- .../vaultwarden-cred-sync-cronjob.yaml | 2 + services/comms/guest-name-job.yaml | 2 + services/comms/pin-othrys-job.yaml | 4 +- services/comms/reset-othrys-room-job.yaml | 4 +- services/comms/seed-othrys-room.yaml | 4 +- .../finance/firefly-user-sync-cronjob.yaml | 2 + .../health/wger-admin-ensure-cronjob.yaml | 2 + services/health/wger-user-sync-cronjob.yaml | 2 + services/mailu/mailu-sync-cronjob.yaml | 4 +- .../monitoring/dashboards/atlas-glue.json | 339 +++++++++++++++++ services/monitoring/dashboards/atlas-gpu.json | 30 +- .../monitoring/dashboards/atlas-nodes.json | 2 +- .../monitoring/dashboards/atlas-overview.json | 133 +++++-- .../monitoring/dashboards/atlas-pods.json | 2 +- .../monitoring/grafana-dashboard-glue.yaml | 348 ++++++++++++++++++ .../monitoring/grafana-dashboard-gpu.yaml | 30 +- .../monitoring/grafana-dashboard-nodes.yaml | 2 +- .../grafana-dashboard-overview.yaml | 133 +++++-- .../monitoring/grafana-dashboard-pods.yaml | 2 +- services/monitoring/kustomization.yaml | 1 + services/nextcloud-mail-sync/cronjob.yaml | 2 + services/outline/deployment.yaml | 2 +- services/planka/deployment.yaml | 2 +- services/vault/k8s-auth-config-cronjob.yaml | 2 + services/vault/oidc-config-cronjob.yaml | 2 + 26 files changed, 1095 insertions(+), 113 deletions(-) create mode 100644 services/monitoring/dashboards/atlas-glue.json create mode 100644 services/monitoring/grafana-dashboard-glue.yaml diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index a5abfe8..4aa2908 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -85,19 +85,17 @@ WORKER_TOTAL = len(WORKER_NODES) CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" WORKER_SUFFIX = f"/{WORKER_TOTAL}" # Namespaces considered infrastructure (excluded from workload counts) -INFRA_NAMESPACES = [ - "kube-system", - "longhorn-system", - "metallb-system", +INFRA_PATTERNS = [ + "kube-.*", + ".*-system", + "traefik", "monitoring", "logging", "cert-manager", - "flux-system", - "traefik", "maintenance", "postgres", ] -INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$" +INFRA_REGEX = f"^({'|'.join(INFRA_PATTERNS)})$" # Namespaces allowed on control plane without counting as workloads CP_ALLOWED_NS = INFRA_REGEX LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" @@ -319,6 +317,21 @@ NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"' NAMESPACE_SCOPE_ALL = 'namespace=~".*"' NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"' NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"] +GLUE_LABEL = 'label_atlas_bstein_dev_glue="true"' +GLUE_JOBS = f"kube_cronjob_labels{{{GLUE_LABEL}}}" +GLUE_LAST_SUCCESS = f"kube_cronjob_status_last_successful_time{{{GLUE_LABEL}}}" +GLUE_LAST_SCHEDULE = f"kube_cronjob_status_last_schedule_time{{{GLUE_LABEL}}}" +GLUE_SUSPENDED = f"kube_cronjob_spec_suspend{{{GLUE_LABEL}}} == 1" +GLUE_LAST_SUCCESS_AGE = f"(time() - {GLUE_LAST_SUCCESS})" +GLUE_LAST_SCHEDULE_AGE = f"(time() - {GLUE_LAST_SCHEDULE})" +GLUE_LAST_SUCCESS_AGE_HOURS = f"({GLUE_LAST_SUCCESS_AGE}) / 3600" +GLUE_LAST_SCHEDULE_AGE_HOURS = f"({GLUE_LAST_SCHEDULE_AGE}) / 3600" +GLUE_STALE_WINDOW_SEC = 36 * 3600 +GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})" +GLUE_MISSING = f"({GLUE_JOBS} unless {GLUE_LAST_SUCCESS})" +GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})" +GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})" +GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -965,7 +978,7 @@ def build_overview(): 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', - {"h": 2, "w": 6, "x": 0, "y": 8}, + {"h": 2, "w": 5, "x": 0, "y": 8}, unit="none", links=link_to("atlas-mail"), ) @@ -976,7 +989,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 2, "w": 6, "x": 12, "y": 8}, + "gridPos": {"h": 2, "w": 5, "x": 10, "y": 8}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1022,7 +1035,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 2, "w": 6, "x": 6, "y": 8}, + {"h": 2, "w": 5, "x": 5, "y": 8}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1034,13 +1047,24 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 2, "w": 6, "x": 18, "y": 8}, + {"h": 2, "w": 5, "x": 15, "y": 8}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, links=link_to("atlas-mail"), ) ) + panels.append( + stat_panel( + 34, + "Glue Jobs Stale", + GLUE_STALE_COUNT, + {"h": 2, "w": 4, "x": 20, "y": 8}, + unit="none", + thresholds=count_thresholds, + links=link_to("atlas-glue"), + ) + ) storage_panels = [ (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), @@ -1072,7 +1096,7 @@ def build_overview(): namespace_cpu_share_expr(cpu_scope), {"h": 9, "w": 8, "x": 0, "y": 16}, links=namespace_scope_links("namespace_scope_cpu"), - description="Values are normalized within the selected scope; use panel links to switch scope.", + description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) ) panels.append( @@ -1082,7 +1106,7 @@ def build_overview(): namespace_gpu_share_expr(gpu_scope), {"h": 9, "w": 8, "x": 8, "y": 16}, links=namespace_scope_links("namespace_scope_gpu"), - description="Values are normalized within the selected scope; use panel links to switch scope.", + description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) ) panels.append( @@ -1092,7 +1116,7 @@ def build_overview(): namespace_ram_share_expr(ram_scope), {"h": 9, "w": 8, "x": 16, "y": 16}, links=namespace_scope_links("namespace_scope_ram"), - description="Values are normalized within the selected scope; use panel links to switch scope.", + description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) ) @@ -2136,6 +2160,98 @@ def build_mail_dashboard(): } +def build_glue_dashboard(): + panels = [] + sort_desc = [{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}] + + panels.append( + stat_panel( + 1, + "Glue Jobs Stale (>36h)", + GLUE_STALE_COUNT, + {"h": 4, "w": 6, "x": 0, "y": 0}, + unit="none", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 2}, + {"color": "red", "value": 3}, + ], + }, + ) + ) + panels.append( + table_panel( + 2, + "Glue Jobs Missing Success", + GLUE_MISSING, + {"h": 4, "w": 6, "x": 6, "y": 0}, + unit="none", + transformations=sort_desc, + instant=True, + ) + ) + panels.append( + table_panel( + 3, + "Glue Jobs Suspended", + f"kube_cronjob_spec_suspend{{{GLUE_LABEL}}} == 1", + {"h": 4, "w": 6, "x": 12, "y": 0}, + unit="none", + transformations=sort_desc, + instant=True, + ) + ) + panels.append( + table_panel( + 4, + "Glue Jobs Active Runs", + f"kube_cronjob_status_active{{{GLUE_LABEL}}}", + {"h": 4, "w": 6, "x": 18, "y": 0}, + unit="none", + transformations=sort_desc, + instant=True, + ) + ) + panels.append( + table_panel( + 5, + "Glue Jobs Last Success (hours ago)", + GLUE_LAST_SUCCESS_AGE_HOURS, + {"h": 8, "w": 12, "x": 0, "y": 4}, + unit="h", + transformations=sort_desc, + instant=True, + ) + ) + panels.append( + table_panel( + 6, + "Glue Jobs Last Schedule (hours ago)", + GLUE_LAST_SCHEDULE_AGE_HOURS, + {"h": 8, "w": 12, "x": 12, "y": 4}, + unit="h", + transformations=sort_desc, + instant=True, + ) + ) + + return { + "uid": "atlas-glue", + "title": "Atlas Glue", + "folderUid": PRIVATE_FOLDER, + "editable": True, + "panels": panels, + "time": {"from": "now-7d", "to": "now"}, + "annotations": {"list": []}, + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "glue"], + } + + def build_gpu_dashboard(): panels = [] gpu_scope = "$namespace_scope_gpu" @@ -2146,7 +2262,7 @@ def build_gpu_dashboard(): namespace_gpu_share_expr(gpu_scope), {"h": 8, "w": 12, "x": 0, "y": 0}, links=namespace_scope_links("namespace_scope_gpu"), - description="Values are normalized within the selected scope; use panel links to switch scope.", + description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) ) panels.append( @@ -2229,6 +2345,10 @@ DASHBOARDS = { "builder": build_mail_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml", }, + "atlas-glue": { + "builder": build_glue_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-glue.yaml", + }, "atlas-gpu": { "builder": build_gpu_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml", diff --git a/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml index 1960d11..86eeaf1 100644 --- a/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml +++ b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml @@ -4,6 +4,8 @@ kind: CronJob metadata: name: vaultwarden-cred-sync namespace: bstein-dev-home + labels: + atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" concurrencyPolicy: Forbid diff --git a/services/comms/guest-name-job.yaml b/services/comms/guest-name-job.yaml index 0797168..21a8af5 100644 --- a/services/comms/guest-name-job.yaml +++ b/services/comms/guest-name-job.yaml @@ -4,6 +4,8 @@ kind: CronJob metadata: name: guest-name-randomizer namespace: comms + labels: + atlas.bstein.dev/glue: "true" spec: schedule: "*/1 * * * *" suspend: false diff --git a/services/comms/pin-othrys-job.yaml b/services/comms/pin-othrys-job.yaml index f25c18e..2b29ca3 100644 --- a/services/comms/pin-othrys-job.yaml +++ b/services/comms/pin-othrys-job.yaml @@ -4,6 +4,8 @@ kind: CronJob metadata: name: pin-othrys-invite namespace: comms + labels: + atlas.bstein.dev/glue: "true" spec: schedule: "*/30 * * * *" suspend: true @@ -164,4 +166,4 @@ spec: - name: vault-scripts configMap: name: comms-vault-env - defaultMode: 0555 \ No newline at end of file + defaultMode: 0555 diff --git a/services/comms/reset-othrys-room-job.yaml b/services/comms/reset-othrys-room-job.yaml index c0d941b..ae8585a 100644 --- a/services/comms/reset-othrys-room-job.yaml +++ b/services/comms/reset-othrys-room-job.yaml @@ -4,6 +4,8 @@ kind: CronJob metadata: name: othrys-room-reset namespace: comms + labels: + atlas.bstein.dev/glue: "true" spec: schedule: "0 0 1 1 *" suspend: true @@ -307,4 +309,4 @@ spec: - name: vault-scripts configMap: name: comms-vault-env - defaultMode: 0555 \ No newline at end of file + defaultMode: 0555 diff --git a/services/comms/seed-othrys-room.yaml b/services/comms/seed-othrys-room.yaml index ce87c85..804d330 100644 --- a/services/comms/seed-othrys-room.yaml +++ b/services/comms/seed-othrys-room.yaml @@ -4,6 +4,8 @@ kind: CronJob metadata: name: seed-othrys-room namespace: comms + labels: + atlas.bstein.dev/glue: "true" spec: schedule: "*/10 * * * *" suspend: true @@ -180,4 +182,4 @@ spec: - name: vault-scripts configMap: name: comms-vault-env - defaultMode: 0555 \ No newline at end of file + defaultMode: 0555 diff --git a/services/finance/firefly-user-sync-cronjob.yaml b/services/finance/firefly-user-sync-cronjob.yaml index dab7f31..aeadfad 100644 --- a/services/finance/firefly-user-sync-cronjob.yaml +++ b/services/finance/firefly-user-sync-cronjob.yaml @@ -4,6 +4,8 @@ kind: CronJob metadata: name: firefly-user-sync namespace: finance + labels: + atlas.bstein.dev/glue: "true" spec: schedule: "0 6 * * *" suspend: true diff --git a/services/health/wger-admin-ensure-cronjob.yaml b/services/health/wger-admin-ensure-cronjob.yaml index fc18283..db178a3 100644 --- a/services/health/wger-admin-ensure-cronjob.yaml +++ b/services/health/wger-admin-ensure-cronjob.yaml @@ -4,6 +4,8 @@ kind: CronJob metadata: name: wger-admin-ensure namespace: health + labels: + atlas.bstein.dev/glue: "true" spec: schedule: "15 3 * * *" concurrencyPolicy: Forbid diff --git a/services/health/wger-user-sync-cronjob.yaml b/services/health/wger-user-sync-cronjob.yaml index 1645256..de2dbb9 100644 --- a/services/health/wger-user-sync-cronjob.yaml +++ b/services/health/wger-user-sync-cronjob.yaml @@ -4,6 +4,8 @@ kind: CronJob metadata: name: wger-user-sync namespace: health + labels: + atlas.bstein.dev/glue: "true" spec: schedule: "0 5 * * *" suspend: true diff --git a/services/mailu/mailu-sync-cronjob.yaml b/services/mailu/mailu-sync-cronjob.yaml index 9e0e35c..57cbd0a 100644 --- a/services/mailu/mailu-sync-cronjob.yaml +++ b/services/mailu/mailu-sync-cronjob.yaml @@ -4,6 +4,8 @@ kind: CronJob metadata: name: mailu-sync-nightly namespace: mailu-mailserver + labels: + atlas.bstein.dev/glue: "true" spec: schedule: "30 4 * * *" concurrencyPolicy: Forbid @@ -79,4 +81,4 @@ spec: - name: vault-scripts configMap: name: mailu-vault-env - defaultMode: 0555 \ No newline at end of file + defaultMode: 0555 diff --git a/services/monitoring/dashboards/atlas-glue.json b/services/monitoring/dashboards/atlas-glue.json new file mode 100644 index 0000000..c836d18 --- /dev/null +++ b/services/monitoring/dashboards/atlas-glue.json @@ -0,0 +1,339 @@ +{ + "uid": "atlas-glue", + "title": "Atlas Glue", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Glue Jobs Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "table", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "(kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 3, + "type": "table", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 4, + "type": "table", + "title": "Glue Jobs Active Runs", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "kube_cronjob_status_active{label_atlas_bstein_dev_glue=\"true\"}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 5, + "type": "table", + "title": "Glue Jobs Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "targets": [ + { + "expr": "((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 6, + "type": "table", + "title": "Glue Jobs Last Schedule (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "targets": [ + { + "expr": "((time() - kube_cronjob_status_last_schedule_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + } + ], + "time": { + "from": "now-7d", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "glue" + ] +} diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index fb1b216..af8a1c5 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -57,7 +57,7 @@ "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { @@ -67,11 +67,11 @@ }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], - "description": "Values are normalized within the selected scope; use panel links to switch scope." + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." }, { "id": 2, @@ -207,16 +207,16 @@ "name": "namespace_scope_cpu", "label": "CPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { @@ -226,7 +226,7 @@ }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], @@ -241,16 +241,16 @@ "name": "namespace_scope_gpu", "label": "GPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { @@ -260,7 +260,7 @@ }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], @@ -275,16 +275,16 @@ "name": "namespace_scope_ram", "label": "RAM namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { @@ -294,7 +294,7 @@ }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index 0bfd639..2d60042 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -142,7 +142,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index a113d22..e1c5d3a 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -76,7 +76,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"}) or on() vector(0)", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)", "refId": "A" } ], @@ -796,7 +796,7 @@ }, "gridPos": { "h": 2, - "w": 6, + "w": 5, "x": 0, "y": 8 }, @@ -863,8 +863,8 @@ }, "gridPos": { "h": 2, - "w": 6, - "x": 12, + "w": 5, + "x": 10, "y": 8 }, "targets": [ @@ -968,8 +968,8 @@ }, "gridPos": { "h": 2, - "w": 6, - "x": 6, + "w": 5, + "x": 5, "y": 8 }, "targets": [ @@ -1044,8 +1044,8 @@ }, "gridPos": { "h": 2, - "w": 6, - "x": 18, + "w": 5, + "x": 15, "y": 8 }, "targets": [ @@ -1110,6 +1110,81 @@ } ] }, + { + "id": 34, + "type": "stat", + "title": "Glue Jobs Stale", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 20, + "y": 8 + }, + "targets": [ + { + "expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-glue dashboard", + "url": "/d/atlas-glue", + "targetBlank": true + } + ] + }, { "id": 23, "type": "stat", @@ -1447,7 +1522,7 @@ "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { @@ -1457,11 +1532,11 @@ }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], - "description": "Values are normalized within the selected scope; use panel links to switch scope." + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." }, { "id": 12, @@ -1516,7 +1591,7 @@ "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { @@ -1526,11 +1601,11 @@ }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], - "description": "Values are normalized within the selected scope; use panel links to switch scope." + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." }, { "id": 13, @@ -1585,7 +1660,7 @@ "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22", "targetBlank": false }, { @@ -1595,11 +1670,11 @@ }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22", "targetBlank": false } ], - "description": "Values are normalized within the selected scope; use panel links to switch scope." + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." }, { "id": 14, @@ -2174,16 +2249,16 @@ "name": "namespace_scope_cpu", "label": "CPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { @@ -2193,7 +2268,7 @@ }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], @@ -2208,16 +2283,16 @@ "name": "namespace_scope_gpu", "label": "GPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { @@ -2227,7 +2302,7 @@ }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], @@ -2242,16 +2317,16 @@ "name": "namespace_scope_ram", "label": "RAM namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { @@ -2261,7 +2336,7 @@ }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index ff2dbdd..adab84b 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -200,7 +200,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-glue.yaml b/services/monitoring/grafana-dashboard-glue.yaml new file mode 100644 index 0000000..0f8c0a1 --- /dev/null +++ b/services/monitoring/grafana-dashboard-glue.yaml @@ -0,0 +1,348 @@ +# services/monitoring/grafana-dashboard-glue.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-glue + labels: + grafana_dashboard: "1" +data: + atlas-glue.json: | + { + "uid": "atlas-glue", + "title": "Atlas Glue", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Glue Jobs Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "table", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "(kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 3, + "type": "table", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 4, + "type": "table", + "title": "Glue Jobs Active Runs", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "kube_cronjob_status_active{label_atlas_bstein_dev_glue=\"true\"}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 5, + "type": "table", + "title": "Glue Jobs Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "targets": [ + { + "expr": "((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 6, + "type": "table", + "title": "Glue Jobs Last Schedule (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "targets": [ + { + "expr": "((time() - kube_cronjob_status_last_schedule_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + } + ], + "time": { + "from": "now-7d", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "glue" + ] + } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 49b5d39..d7950f2 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -66,7 +66,7 @@ data: "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { @@ -76,11 +76,11 @@ data: }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], - "description": "Values are normalized within the selected scope; use panel links to switch scope." + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." }, { "id": 2, @@ -216,16 +216,16 @@ data: "name": "namespace_scope_cpu", "label": "CPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { @@ -235,7 +235,7 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], @@ -250,16 +250,16 @@ data: "name": "namespace_scope_gpu", "label": "GPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { @@ -269,7 +269,7 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], @@ -284,16 +284,16 @@ data: "name": "namespace_scope_ram", "label": "RAM namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { @@ -303,7 +303,7 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index 5e02c18..f0f1982 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -151,7 +151,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index e627658..78d5566 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -85,7 +85,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"}) or on() vector(0)", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)", "refId": "A" } ], @@ -805,7 +805,7 @@ data: }, "gridPos": { "h": 2, - "w": 6, + "w": 5, "x": 0, "y": 8 }, @@ -872,8 +872,8 @@ data: }, "gridPos": { "h": 2, - "w": 6, - "x": 12, + "w": 5, + "x": 10, "y": 8 }, "targets": [ @@ -977,8 +977,8 @@ data: }, "gridPos": { "h": 2, - "w": 6, - "x": 6, + "w": 5, + "x": 5, "y": 8 }, "targets": [ @@ -1053,8 +1053,8 @@ data: }, "gridPos": { "h": 2, - "w": 6, - "x": 18, + "w": 5, + "x": 15, "y": 8 }, "targets": [ @@ -1119,6 +1119,81 @@ data: } ] }, + { + "id": 34, + "type": "stat", + "title": "Glue Jobs Stale", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 20, + "y": 8 + }, + "targets": [ + { + "expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-glue dashboard", + "url": "/d/atlas-glue", + "targetBlank": true + } + ] + }, { "id": 23, "type": "stat", @@ -1456,7 +1531,7 @@ data: "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { @@ -1466,11 +1541,11 @@ data: }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], - "description": "Values are normalized within the selected scope; use panel links to switch scope." + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." }, { "id": 12, @@ -1525,7 +1600,7 @@ data: "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { @@ -1535,11 +1610,11 @@ data: }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], - "description": "Values are normalized within the selected scope; use panel links to switch scope." + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." }, { "id": 13, @@ -1594,7 +1669,7 @@ data: "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22", "targetBlank": false }, { @@ -1604,11 +1679,11 @@ data: }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Ccert-manager%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22", "targetBlank": false } ], - "description": "Values are normalized within the selected scope; use panel links to switch scope." + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." }, { "id": 14, @@ -2183,16 +2258,16 @@ data: "name": "namespace_scope_cpu", "label": "CPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { @@ -2202,7 +2277,7 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], @@ -2217,16 +2292,16 @@ data: "name": "namespace_scope_gpu", "label": "GPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { @@ -2236,7 +2311,7 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], @@ -2251,16 +2326,16 @@ data: "name": "namespace_scope_ram", "label": "RAM namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { @@ -2270,7 +2345,7 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"", + "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index 5ea8343..f537d4c 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -209,7 +209,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|cert-manager|flux-system|traefik|maintenance|postgres)$\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})", "refId": "A" } ], diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index b12556e..69ad326 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -14,6 +14,7 @@ resources: - grafana-dashboard-network.yaml - grafana-dashboard-gpu.yaml - grafana-dashboard-mail.yaml + - grafana-dashboard-glue.yaml - dcgm-exporter.yaml - jetson-tegrastats-exporter.yaml - postmark-exporter-service.yaml diff --git a/services/nextcloud-mail-sync/cronjob.yaml b/services/nextcloud-mail-sync/cronjob.yaml index 728bf98..cb42d49 100644 --- a/services/nextcloud-mail-sync/cronjob.yaml +++ b/services/nextcloud-mail-sync/cronjob.yaml @@ -4,6 +4,8 @@ kind: CronJob metadata: name: nextcloud-mail-sync namespace: nextcloud + labels: + atlas.bstein.dev/glue: "true" spec: schedule: "0 5 * * *" concurrencyPolicy: Forbid diff --git a/services/outline/deployment.yaml b/services/outline/deployment.yaml index 471d185..cca3964 100644 --- a/services/outline/deployment.yaml +++ b/services/outline/deployment.yaml @@ -41,9 +41,9 @@ spec: export OIDC_USERINFO_URI="{{ .Data.data.OIDC_USERINFO_URI }}" {{ end }} {{ with secret "kv/data/atlas/outline/outline-smtp" }} - export SMTP_FROM_EMAIL="{{ .Data.data.SMTP_FROM_EMAIL }}" export SMTP_HOST="{{ .Data.data.SMTP_HOST }}" {{ end }} + export SMTP_FROM_EMAIL="no-reply-outline@bstein.dev" {{ with secret "kv/data/atlas/shared/postmark-relay" }} export SMTP_USERNAME="{{ index .Data.data "relay-username" }}" export SMTP_PASSWORD="{{ index .Data.data "relay-password" }}" diff --git a/services/planka/deployment.yaml b/services/planka/deployment.yaml index afda7fd..155ac80 100644 --- a/services/planka/deployment.yaml +++ b/services/planka/deployment.yaml @@ -41,12 +41,12 @@ spec: export OIDC_USE_OAUTH_CALLBACK="{{ .Data.data.OIDC_USE_OAUTH_CALLBACK }}" {{ end }} {{ with secret "kv/data/atlas/planka/planka-smtp" }} - export SMTP_FROM="{{ .Data.data.SMTP_FROM }}" export SMTP_HOST="{{ .Data.data.SMTP_HOST }}" export SMTP_PORT="{{ .Data.data.SMTP_PORT }}" export SMTP_SECURE="{{ .Data.data.SMTP_SECURE }}" export SMTP_TLS_REJECT_UNAUTHORIZED="{{ .Data.data.SMTP_TLS_REJECT_UNAUTHORIZED }}" {{ end }} + export SMTP_FROM="no-reply-planka@bstein.dev" {{ with secret "kv/data/atlas/shared/postmark-relay" }} export SMTP_USER="{{ index .Data.data "relay-username" }}" export SMTP_PASSWORD="{{ index .Data.data "relay-password" }}" diff --git a/services/vault/k8s-auth-config-cronjob.yaml b/services/vault/k8s-auth-config-cronjob.yaml index 6a644df..29e8e80 100644 --- a/services/vault/k8s-auth-config-cronjob.yaml +++ b/services/vault/k8s-auth-config-cronjob.yaml @@ -4,6 +4,8 @@ kind: CronJob metadata: name: vault-k8s-auth-config namespace: vault + labels: + atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" concurrencyPolicy: Forbid diff --git a/services/vault/oidc-config-cronjob.yaml b/services/vault/oidc-config-cronjob.yaml index 3140073..013c9f3 100644 --- a/services/vault/oidc-config-cronjob.yaml +++ b/services/vault/oidc-config-cronjob.yaml @@ -4,6 +4,8 @@ kind: CronJob metadata: name: vault-oidc-config namespace: vault + labels: + atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" concurrencyPolicy: Forbid