From fcc0a493699501bace78430ff27d24ff6ce6f53a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sun, 11 Jan 2026 23:46:24 -0300 Subject: [PATCH] monitoring: fix infra scopes and add jetson metrics --- .../flux-system/platform/kustomization.yaml | 1 + scripts/dashboards_render_atlas.py | 89 +++---- services/monitoring/dashboards/atlas-gpu.json | 28 +-- .../monitoring/dashboards/atlas-nodes.json | 2 +- .../monitoring/dashboards/atlas-overview.json | 234 ++---------------- .../monitoring/dashboards/atlas-pods.json | 2 +- .../monitoring/dashboards/atlas-storage.json | 132 ++++++++++ services/monitoring/dcgm-exporter.yaml | 11 +- .../monitoring/grafana-alerting-config.yaml | 9 +- .../monitoring/grafana-dashboard-gpu.yaml | 28 +-- .../monitoring/grafana-dashboard-nodes.yaml | 2 +- .../grafana-dashboard-overview.yaml | 234 ++---------------- .../monitoring/grafana-dashboard-pods.yaml | 2 +- .../monitoring/grafana-dashboard-storage.yaml | 132 ++++++++++ .../jetson-tegrastats-exporter.yaml | 168 +++++++++++++ services/monitoring/kustomization.yaml | 1 + 16 files changed, 559 insertions(+), 516 deletions(-) create mode 100644 services/monitoring/jetson-tegrastats-exporter.yaml diff --git a/clusters/atlas/flux-system/platform/kustomization.yaml b/clusters/atlas/flux-system/platform/kustomization.yaml index df226e2..7da2ca3 100644 --- a/clusters/atlas/flux-system/platform/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/kustomization.yaml @@ -9,5 +9,6 @@ resources: - gitops-ui/kustomization.yaml - monitoring/kustomization.yaml - logging/kustomization.yaml + - maintenance/kustomization.yaml - longhorn-ui/kustomization.yaml - ../platform/vault-csi/kustomization.yaml diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 5474298..7cbb386 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -84,7 +84,18 @@ CONTROL_TOTAL = len(CONTROL_PLANE_NODES) WORKER_TOTAL = len(WORKER_NODES) CONTROL_SUFFIX = 
f"/{CONTROL_TOTAL}" WORKER_SUFFIX = f"/{WORKER_TOTAL}" -CP_ALLOWED_NS = "(^kube.*|.*-system$|^traefik$|^monitoring$)" +# Namespaces considered infrastructure (excluded from workload counts) +INFRA_NAMESPACES = [ + "kube-system", + "longhorn-system", + "metallb-system", + "monitoring", + "flux-system", + "traefik", +] +INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$" +# Namespaces allowed on control plane without counting as workloads +CP_ALLOWED_NS = INFRA_REGEX LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4] CONTROL_WORKLOADS_EXPR = ( @@ -300,9 +311,9 @@ STUCK_TABLE_EXPR = ( ")" ) -NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$|^monitoring$)"' +NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"' NAMESPACE_SCOPE_ALL = 'namespace=~".*"' -NAMESPACE_SCOPE_INFRA = 'namespace=~"(^kube.*|.*-system$|^traefik$|^monitoring$)"' +NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"' NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"] GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) @@ -1232,51 +1243,6 @@ def build_overview(): links=link_to("atlas-storage"), ) ) - panels.append( - stat_panel( - 30, - "Maintenance Sweepers Ready", - 'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100', - {"h": 6, "w": 8, "x": 0, "y": 80}, - unit="percent", - thresholds=PERCENT_THRESHOLDS, - ) - ) - panels.append( - stat_panel( - 31, - "Maintenance Cron Freshness (s)", - 'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})', - {"h": 6, "w": 8, "x": 8, "y": 80}, - unit="s", - thresholds={ - "mode": "absolute", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", 
"value": 3600}, - {"color": "red", "value": 10800}, - ], - }, - ) - ) - panels.append( - stat_panel( - 32, - "Postmark Bounce Rate (1d)", - 'POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}', - {"h": 6, "w": 8, "x": 16, "y": 80}, - unit="percent", - thresholds={ - "mode": "absolute", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 2}, - {"color": "red", "value": 5}, - ], - }, - ) - ) - return { "uid": "atlas-overview", "title": "Atlas Overview", @@ -1743,6 +1709,33 @@ def build_storage_dashboard(): time_from="90d", ) ) + panels.append( + stat_panel( + 30, + "Maintenance Sweepers Ready", + 'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100', + {"h": 4, "w": 12, "x": 0, "y": 44}, + unit="percent", + thresholds=PERCENT_THRESHOLDS, + ) + ) + panels.append( + stat_panel( + 31, + "Maintenance Cron Freshness (s)", + 'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})', + {"h": 4, "w": 12, "x": 12, "y": 44}, + unit="s", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 3600}, + {"color": "red", "value": 10800}, + ], + }, + ) + ) return { "uid": "atlas-storage", "title": "Atlas Storage", diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 2e71045..9460177 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -57,7 +57,7 @@ "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": 
"?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { @@ -67,7 +67,7 @@ }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], @@ -207,16 +207,16 @@ "name": "namespace_scope_cpu", "label": "CPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, { @@ -226,7 +226,7 @@ }, { "text": "infrastructure 
namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": false } ], @@ -241,16 +241,16 @@ "name": "namespace_scope_gpu", "label": "GPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, { @@ -260,7 +260,7 @@ }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": false } ], @@ -275,16 +275,16 @@ "name": "namespace_scope_ram", "label": "RAM namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "query": "workload namespaces only : 
namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, { @@ -294,7 +294,7 @@ }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": false } ], diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index 256bc18..499e14e 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -142,7 +142,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"})", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 44403fb..bef23e2 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -76,7 +76,7 @@ }, "targets": [ { - "expr": 
"sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"}) or on() vector(0)", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"}) or on() vector(0)", "refId": "A" } ], @@ -1447,7 +1447,7 @@ "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { @@ -1457,7 +1457,7 @@ }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], @@ -1516,7 +1516,7 @@ "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", 
"targetBlank": false }, { @@ -1526,7 +1526,7 @@ }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], @@ -1585,7 +1585,7 @@ "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22", "targetBlank": false }, { @@ -1595,7 +1595,7 @@ }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22", "targetBlank": false } ], @@ -2160,202 +2160,6 @@ } } ] - }, - { - "id": 30, - "type": "stat", - "title": "Maintenance Sweepers Ready", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 80 - }, - "targets": [ - { - "expr": 
"kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 75 - }, - { - "color": "red", - "value": 91.5 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 31, - "type": "stat", - "title": "Maintenance Cron Freshness (s)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 80 - }, - "targets": [ - { - "expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 3600 - }, - { - "color": "red", - "value": 10800 - } - ] - }, - "unit": "s", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 32, - "type": "stat", - "title": "Postmark Bounce Rate (1d)", - "datasource": { - "type": 
"prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 16, - "y": 80 - }, - "targets": [ - { - "expr": "POSTMARK_OUTBOUND_BOUNCE_RATE{window=\"1d\"}", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 2 - }, - { - "color": "red", - "value": 5 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } } ], "schemaVersion": 39, @@ -2370,16 +2174,16 @@ "name": "namespace_scope_cpu", "label": "CPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, { @@ -2389,7 +2193,7 @@ }, { "text": "infrastructure 
namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": false } ], @@ -2404,16 +2208,16 @@ "name": "namespace_scope_gpu", "label": "GPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, { @@ -2423,7 +2227,7 @@ }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": false } ], @@ -2438,16 +2242,16 @@ "name": "namespace_scope_ram", "label": "RAM namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "query": "workload namespaces only : 
namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, { @@ -2457,7 +2261,7 @@ }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": false } ], diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index 68429ec..a0f9e1c 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -200,7 +200,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"})", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json index 2e548b2..d93a941 100644 --- a/services/monitoring/dashboards/atlas-storage.json +++ b/services/monitoring/dashboards/atlas-storage.json @@ -409,6 +409,138 @@ } }, "timeFrom": "90d" + }, + { + "id": 30, + "type": "stat", + "title": "Maintenance Sweepers 
Ready", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 44 + }, + "targets": [ + { + "expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 75 + }, + { + "color": "red", + "value": 91.5 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 31, + "type": "stat", + "title": "Maintenance Cron Freshness (s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 44 + }, + "targets": [ + { + "expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 3600 + }, + { + "color": "red", + "value": 10800 + } + ] + }, + "unit": "s", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": 
"", + "values": false + }, + "textMode": "value" + } } ], "time": { diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index cd37b7b..7627420 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -28,13 +28,14 @@ spec: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - - key: kubernetes.io/hostname + - key: kubernetes.io/arch operator: In values: - - titan-20 - - titan-21 - - titan-22 - - titan-24 + - amd64 + - key: jetson + operator: NotIn + values: + - "true" tolerations: - operator: Exists containers: diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml index 7800d8d..c679bff 100644 --- a/services/monitoring/grafana-alerting-config.yaml +++ b/services/monitoring/grafana-alerting-config.yaml @@ -34,6 +34,7 @@ data: - uid: disk-pressure-root title: "Node rootfs high (>80%)" condition: C + for: "10m" data: - refId: A relativeTimeRange: @@ -81,6 +82,7 @@ data: - uid: disk-growth-1h title: "Node rootfs growing fast (>1Gi in 1h)" condition: C + for: "10m" data: - refId: A relativeTimeRange: @@ -133,6 +135,7 @@ data: - uid: cpu-high-10m title: "Node CPU high (>90% for 10m)" condition: C + for: 10m data: - refId: A relativeTimeRange: @@ -185,6 +188,7 @@ data: - uid: maint-sweeper title: "Maintenance sweeper not ready" condition: C + for: "5m" data: - refId: A relativeTimeRange: @@ -232,10 +236,11 @@ data: - uid: maint-cron-stale title: "Maintenance CronJobs stale (>3h since success)" condition: C + for: "5m" data: - refId: A relativeTimeRange: - from: 0 + from: 300 to: 0 datasourceUid: atlas-vm model: @@ -284,6 +289,7 @@ data: - uid: postmark-bounce title: "Postmark bounce rate high" condition: C + for: "10m" data: - refId: A relativeTimeRange: @@ -331,6 +337,7 @@ data: - uid: postmark-api-down title: "Postmark exporter down" condition: C + for: "5m" data: - refId: A relativeTimeRange: 
diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 56965eb..3f7bbec 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -66,7 +66,7 @@ data: "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { @@ -76,7 +76,7 @@ data: }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], @@ -216,16 +216,16 @@ data: "name": "namespace_scope_cpu", "label": "CPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : 
namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, { @@ -235,7 +235,7 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": false } ], @@ -250,16 +250,16 @@ data: "name": "namespace_scope_gpu", "label": "GPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": 
true }, { @@ -269,7 +269,7 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": false } ], @@ -284,16 +284,16 @@ data: "name": "namespace_scope_ram", "label": "RAM namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, { @@ -303,7 +303,7 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": false } ], diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index 368f80f..42d2c3f 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -151,7 +151,7 @@ data: }, "targets": [ { - 
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"})", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 062310c..d89255c 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -85,7 +85,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"}) or on() vector(0)", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"}) or on() vector(0)", "refId": "A" } ], @@ -1456,7 +1456,7 @@ data: "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { @@ -1466,7 +1466,7 @@ data: }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "url": 
"?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], @@ -1525,7 +1525,7 @@ data: "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { @@ -1535,7 +1535,7 @@ data: }, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], @@ -1594,7 +1594,7 @@ data: "links": [ { "title": "Workload namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22", "targetBlank": false }, { @@ -1604,7 +1604,7 @@ data: 
}, { "title": "Infrastructure namespaces only", - "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22", "targetBlank": false } ], @@ -2169,202 +2169,6 @@ data: } } ] - }, - { - "id": 30, - "type": "stat", - "title": "Maintenance Sweepers Ready", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 80 - }, - "targets": [ - { - "expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 75 - }, - { - "color": "red", - "value": 91.5 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 31, - "type": "stat", - "title": "Maintenance Cron Freshness (s)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 80 - }, - "targets": [ - { - "expr": "time() - max by (cronjob) 
(kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 3600 - }, - { - "color": "red", - "value": 10800 - } - ] - }, - "unit": "s", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 32, - "type": "stat", - "title": "Postmark Bounce Rate (1d)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 16, - "y": 80 - }, - "targets": [ - { - "expr": "POSTMARK_OUTBOUND_BOUNCE_RATE{window=\"1d\"}", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 2 - }, - { - "color": "red", - "value": 5 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } } ], "schemaVersion": 39, @@ -2379,16 +2183,16 @@ data: "name": "namespace_scope_cpu", "label": "CPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "query": 
"workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, { @@ -2398,7 +2202,7 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": false } ], @@ -2413,16 +2217,16 @@ data: "name": "namespace_scope_gpu", "label": "GPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - 
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, { @@ -2432,7 +2236,7 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": false } ], @@ -2447,16 +2251,16 @@ data: "name": "namespace_scope_ram", "label": "RAM namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": true }, { @@ -2466,7 +2270,7 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"", "selected": false } ], diff --git a/services/monitoring/grafana-dashboard-pods.yaml 
b/services/monitoring/grafana-dashboard-pods.yaml index 0f43a05..1ca5afb 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -209,7 +209,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"})", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml index 8aef820..5ce4186 100644 --- a/services/monitoring/grafana-dashboard-storage.yaml +++ b/services/monitoring/grafana-dashboard-storage.yaml @@ -418,6 +418,138 @@ data: } }, "timeFrom": "90d" + }, + { + "id": 30, + "type": "stat", + "title": "Maintenance Sweepers Ready", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 44 + }, + "targets": [ + { + "expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 75 + }, + { + "color": "red", + "value": 91.5 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 
31, + "type": "stat", + "title": "Maintenance Cron Freshness (s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 44 + }, + "targets": [ + { + "expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 3600 + }, + { + "color": "red", + "value": 10800 + } + ] + }, + "unit": "s", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } } ], "time": { diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml new file mode 100644 index 0000000..32a4455 --- /dev/null +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -0,0 +1,168 @@ +# services/monitoring/jetson-tegrastats-exporter.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: jetson-tegrastats-exporter + namespace: monitoring + labels: + app: jetson-tegrastats-exporter +spec: + selector: + matchLabels: + app: jetson-tegrastats-exporter + template: + metadata: + labels: + app: jetson-tegrastats-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9100" + spec: + serviceAccountName: default + hostPID: true + tolerations: + - operator: Exists + nodeSelector: + jetson: "true" + containers: + - name: exporter + # Exposes tegrastats output as Prometheus metrics for Jetson devices. 
+          image: python:3.10-slim
+          imagePullPolicy: IfNotPresent
+          securityContext:
+            privileged: true
+          ports:
+            - name: metrics
+              containerPort: 9100
+          resources:
+            requests:
+              cpu: 50m
+              memory: 64Mi
+            limits:
+              cpu: 200m
+              memory: 256Mi
+          env:
+            - name: JETSON_EXPORTER_PORT  # read by exporter.py; falls back to 9100 when unset
+              value: "9100"
+          volumeMounts:
+            - name: script
+              mountPath: /etc/tegrastats-exporter
+              readOnly: true
+            - name: tegrastats-bin  # host binary that exporter.py launches as a subprocess
+              mountPath: /host/usr/bin/tegrastats
+              readOnly: true
+          command:
+            - python
+            - /etc/tegrastats-exporter/exporter.py
+      volumes:
+        - name: script
+          configMap:
+            name: jetson-tegrastats-exporter-script
+            defaultMode: 0555
+        - name: tegrastats-bin
+          hostPath:
+            path: /usr/bin/tegrastats
+            type: File
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: jetson-tegrastats-exporter
+  namespace: monitoring
+  labels:
+    app: jetson-tegrastats-exporter
+spec:
+  selector:
+    app: jetson-tegrastats-exporter
+  ports:
+    - name: metrics
+      port: 9100
+      targetPort: metrics
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: jetson-tegrastats-exporter-script
+  namespace: monitoring
+data:
+  exporter.py: |  # literal block scalar: backslashes below reach Python verbatim, so never double them
+    import http.server
+    import os
+    import re
+    import socketserver
+    import subprocess
+    import threading
+    from time import time
+
+    PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
+    METRICS = {  # latest parsed sample; each key is served as gauge jetson_<key>
+        "gr3d_freq_percent": 0.0,
+        "gpu_temp_c": 0.0,
+        "cpu_temp_c": 0.0,
+        "ram_used_mb": 0.0,
+        "ram_total_mb": 0.0,
+        "power_5v_in_mw": 0.0,
+        "last_scrape_ts": 0.0,
+    }
+    LOCK = threading.Lock()  # guards METRICS between the reader thread and HTTP handlers
+
+    def parse_line(line: str):  # fold one tegrastats output line into METRICS; returns None
+        updates = {}  # only fields found in this line; absent fields keep their previous value
+        m = re.search(r"GR3D_FREQ\s+(\d+)%", line)
+        if m:
+            updates["gr3d_freq_percent"] = float(m.group(1))
+        m = re.search(r"GPU@(\d+(?:\.\d+)?)C", line)
+        if m:
+            updates["gpu_temp_c"] = float(m.group(1))
+        m = re.search(r"CPU@(\d+(?:\.\d+)?)C", line)
+        if m:
+            updates["cpu_temp_c"] = float(m.group(1))
+        m = re.search(r"RAM\s+(\d+)/(\d+)MB", line)
+        if m:
+            updates["ram_used_mb"] = float(m.group(1))
+            updates["ram_total_mb"] = float(m.group(2))
+        m = re.search(r"POM_5V_IN\s+(\d+)/(\d+)", line)
+        if m:
+            updates["power_5v_in_mw"] = float(m.group(1))  # first number only; the second is ignored
+        with LOCK:
+            METRICS.update(updates)
+            METRICS["last_scrape_ts"] = time()  # stamped for every line read, even if nothing matched
+
+    def run_tegrastats():  # reader-thread target: stream tegrastats stdout into parse_line
+        proc = subprocess.Popen(
+            ["/host/usr/bin/tegrastats", "--interval", "1000"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
+        )
+        for line in proc.stdout:  # loop ends only when tegrastats closes its stdout
+            parse_line(line)
+
+    class Handler(http.server.BaseHTTPRequestHandler):  # serves GET /metrics; any other path is 404
+        def do_GET(self):
+            if self.path != "/metrics":
+                self.send_response(404)
+                self.end_headers()
+                return
+            with LOCK:
+                metrics = METRICS.copy()  # snapshot so the response is built outside the lock
+            out = []
+            for k, v in metrics.items():
+                out.append(f"# TYPE jetson_{k} gauge")
+                out.append(f"jetson_{k} {v}")
+            body = ("\n".join(out) + "\n").encode("utf-8")  # real newlines; encode once so the length is in bytes
+            self.send_response(200)
+            self.send_header("Content-Type", "text/plain; version=0.0.4")
+            self.send_header("Content-Length", str(len(body)))
+            self.end_headers()
+            self.wfile.write(body)
+
+        def log_message(self, fmt, *args):  # override to suppress per-request access logging
+            return
+
+    if __name__ == "__main__":
+        t = threading.Thread(target=run_tegrastats, daemon=True)
+        t.start()
+        with socketserver.TCPServer(("", PORT), Handler) as httpd:
+            httpd.serve_forever()
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index 7198ffc..9f0e8ca 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -14,6 +14,7 @@ resources:
   - grafana-dashboard-gpu.yaml
   - grafana-dashboard-mail.yaml
   - dcgm-exporter.yaml
+  - jetson-tegrastats-exporter.yaml
   - postmark-exporter-service.yaml
   - postmark-exporter-deployment.yaml
   - grafana-alerting-config.yaml