monitoring: fix infra scopes and add jetson metrics
This commit is contained in:
parent
3a798ae3b1
commit
fcc0a49369
@ -9,5 +9,6 @@ resources:
|
||||
- gitops-ui/kustomization.yaml
|
||||
- monitoring/kustomization.yaml
|
||||
- logging/kustomization.yaml
|
||||
- maintenance/kustomization.yaml
|
||||
- longhorn-ui/kustomization.yaml
|
||||
- ../platform/vault-csi/kustomization.yaml
|
||||
|
||||
@ -84,7 +84,18 @@ CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
|
||||
WORKER_TOTAL = len(WORKER_NODES)
|
||||
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
|
||||
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
|
||||
CP_ALLOWED_NS = "(^kube.*|.*-system$|^traefik$|^monitoring$)"
|
||||
# Namespaces considered infrastructure (excluded from workload counts)
|
||||
INFRA_NAMESPACES = [
|
||||
"kube-system",
|
||||
"longhorn-system",
|
||||
"metallb-system",
|
||||
"monitoring",
|
||||
"flux-system",
|
||||
"traefik",
|
||||
]
|
||||
INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$"
|
||||
# Namespaces allowed on control plane without counting as workloads
|
||||
CP_ALLOWED_NS = INFRA_REGEX
|
||||
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
|
||||
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
|
||||
CONTROL_WORKLOADS_EXPR = (
|
||||
@ -300,9 +311,9 @@ STUCK_TABLE_EXPR = (
|
||||
")"
|
||||
)
|
||||
|
||||
NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$|^monitoring$)"'
|
||||
NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
|
||||
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
|
||||
NAMESPACE_SCOPE_INFRA = 'namespace=~"(^kube.*|.*-system$|^traefik$|^monitoring$)"'
|
||||
NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
|
||||
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
|
||||
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
|
||||
GPU_NODE_REGEX = "|".join(GPU_NODES)
|
||||
@ -1232,51 +1243,6 @@ def build_overview():
|
||||
links=link_to("atlas-storage"),
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
30,
|
||||
"Maintenance Sweepers Ready",
|
||||
'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100',
|
||||
{"h": 6, "w": 8, "x": 0, "y": 80},
|
||||
unit="percent",
|
||||
thresholds=PERCENT_THRESHOLDS,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
31,
|
||||
"Maintenance Cron Freshness (s)",
|
||||
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
|
||||
{"h": 6, "w": 8, "x": 8, "y": 80},
|
||||
unit="s",
|
||||
thresholds={
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 3600},
|
||||
{"color": "red", "value": 10800},
|
||||
],
|
||||
},
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
32,
|
||||
"Postmark Bounce Rate (1d)",
|
||||
'POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}',
|
||||
{"h": 6, "w": 8, "x": 16, "y": 80},
|
||||
unit="percent",
|
||||
thresholds={
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 2},
|
||||
{"color": "red", "value": 5},
|
||||
],
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
"uid": "atlas-overview",
|
||||
"title": "Atlas Overview",
|
||||
@ -1743,6 +1709,33 @@ def build_storage_dashboard():
|
||||
time_from="90d",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
30,
|
||||
"Maintenance Sweepers Ready",
|
||||
'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100',
|
||||
{"h": 4, "w": 12, "x": 0, "y": 44},
|
||||
unit="percent",
|
||||
thresholds=PERCENT_THRESHOLDS,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
31,
|
||||
"Maintenance Cron Freshness (s)",
|
||||
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
|
||||
{"h": 4, "w": 12, "x": 12, "y": 44},
|
||||
unit="s",
|
||||
thresholds={
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 3600},
|
||||
{"color": "red", "value": 10800},
|
||||
],
|
||||
},
|
||||
)
|
||||
)
|
||||
return {
|
||||
"uid": "atlas-storage",
|
||||
"title": "Atlas Storage",
|
||||
|
||||
@ -57,7 +57,7 @@
|
||||
"links": [
|
||||
{
|
||||
"title": "Workload namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"targetBlank": false
|
||||
},
|
||||
{
|
||||
@ -67,7 +67,7 @@
|
||||
},
|
||||
{
|
||||
"title": "Infrastructure namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
@ -207,16 +207,16 @@
|
||||
"name": "namespace_scope_cpu",
|
||||
"label": "CPU namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
@ -226,7 +226,7 @@
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
@ -241,16 +241,16 @@
|
||||
"name": "namespace_scope_gpu",
|
||||
"label": "GPU namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
@ -260,7 +260,7 @@
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
@ -275,16 +275,16 @@
|
||||
"name": "namespace_scope_ram",
|
||||
"label": "RAM namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
@ -294,7 +294,7 @@
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
|
||||
@ -142,7 +142,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})",
|
||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
|
||||
@ -76,7 +76,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"}) or on() vector(0)",
|
||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"}) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -1447,7 +1447,7 @@
|
||||
"links": [
|
||||
{
|
||||
"title": "Workload namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"targetBlank": false
|
||||
},
|
||||
{
|
||||
@ -1457,7 +1457,7 @@
|
||||
},
|
||||
{
|
||||
"title": "Infrastructure namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
@ -1516,7 +1516,7 @@
|
||||
"links": [
|
||||
{
|
||||
"title": "Workload namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"targetBlank": false
|
||||
},
|
||||
{
|
||||
@ -1526,7 +1526,7 @@
|
||||
},
|
||||
{
|
||||
"title": "Infrastructure namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
@ -1585,7 +1585,7 @@
|
||||
"links": [
|
||||
{
|
||||
"title": "Workload namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22",
|
||||
"targetBlank": false
|
||||
},
|
||||
{
|
||||
@ -1595,7 +1595,7 @@
|
||||
},
|
||||
{
|
||||
"title": "Infrastructure namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22",
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
@ -2160,202 +2160,6 @@
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"type": "stat",
|
||||
"title": "Maintenance Sweepers Ready",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 80
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 50
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 75
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 91.5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"type": "stat",
|
||||
"title": "Maintenance Cron Freshness (s)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 80
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 3600
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 10800
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 32,
|
||||
"type": "stat",
|
||||
"title": "Postmark Bounce Rate (1d)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 80
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "POSTMARK_OUTBOUND_BOUNCE_RATE{window=\"1d\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
@ -2370,16 +2174,16 @@
|
||||
"name": "namespace_scope_cpu",
|
||||
"label": "CPU namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
@ -2389,7 +2193,7 @@
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
@ -2404,16 +2208,16 @@
|
||||
"name": "namespace_scope_gpu",
|
||||
"label": "GPU namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
@ -2423,7 +2227,7 @@
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
@ -2438,16 +2242,16 @@
|
||||
"name": "namespace_scope_ram",
|
||||
"label": "RAM namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
@ -2457,7 +2261,7 @@
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
|
||||
@ -200,7 +200,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})",
|
||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
|
||||
@ -409,6 +409,138 @@
|
||||
}
|
||||
},
|
||||
"timeFrom": "90d"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"type": "stat",
|
||||
"title": "Maintenance Sweepers Ready",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 44
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 50
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 75
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 91.5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"type": "stat",
|
||||
"title": "Maintenance Cron Freshness (s)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 44
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 3600
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 10800
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
|
||||
@ -28,13 +28,14 @@ spec:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: kubernetes.io/hostname
|
||||
- key: kubernetes.io/arch
|
||||
operator: In
|
||||
values:
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24
|
||||
- amd64
|
||||
- key: jetson
|
||||
operator: NotIn
|
||||
values:
|
||||
- "true"
|
||||
tolerations:
|
||||
- operator: Exists
|
||||
containers:
|
||||
|
||||
@ -34,6 +34,7 @@ data:
|
||||
- uid: disk-pressure-root
|
||||
title: "Node rootfs high (>80%)"
|
||||
condition: C
|
||||
for: "10m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
@ -81,6 +82,7 @@ data:
|
||||
- uid: disk-growth-1h
|
||||
title: "Node rootfs growing fast (>1Gi in 1h)"
|
||||
condition: C
|
||||
for: "10m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
@ -133,6 +135,7 @@ data:
|
||||
- uid: cpu-high-10m
|
||||
title: "Node CPU high (>90% for 10m)"
|
||||
condition: C
|
||||
for: 10m
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
@ -185,6 +188,7 @@ data:
|
||||
- uid: maint-sweeper
|
||||
title: "Maintenance sweeper not ready"
|
||||
condition: C
|
||||
for: "5m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
@ -232,10 +236,11 @@ data:
|
||||
- uid: maint-cron-stale
|
||||
title: "Maintenance CronJobs stale (>3h since success)"
|
||||
condition: C
|
||||
for: "5m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
from: 300
|
||||
to: 0
|
||||
datasourceUid: atlas-vm
|
||||
model:
|
||||
@ -284,6 +289,7 @@ data:
|
||||
- uid: postmark-bounce
|
||||
title: "Postmark bounce rate high"
|
||||
condition: C
|
||||
for: "10m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
@ -331,6 +337,7 @@ data:
|
||||
- uid: postmark-api-down
|
||||
title: "Postmark exporter down"
|
||||
condition: C
|
||||
for: "5m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
|
||||
@ -66,7 +66,7 @@ data:
|
||||
"links": [
|
||||
{
|
||||
"title": "Workload namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"targetBlank": false
|
||||
},
|
||||
{
|
||||
@ -76,7 +76,7 @@ data:
|
||||
},
|
||||
{
|
||||
"title": "Infrastructure namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
@ -216,16 +216,16 @@ data:
|
||||
"name": "namespace_scope_cpu",
|
||||
"label": "CPU namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
@ -235,7 +235,7 @@ data:
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
@ -250,16 +250,16 @@ data:
|
||||
"name": "namespace_scope_gpu",
|
||||
"label": "GPU namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
@ -269,7 +269,7 @@ data:
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
@ -284,16 +284,16 @@ data:
|
||||
"name": "namespace_scope_ram",
|
||||
"label": "RAM namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
@ -303,7 +303,7 @@ data:
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
|
||||
@ -151,7 +151,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})",
|
||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
|
||||
@ -85,7 +85,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"}) or on() vector(0)",
|
||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"}) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -1456,7 +1456,7 @@ data:
|
||||
"links": [
|
||||
{
|
||||
"title": "Workload namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"targetBlank": false
|
||||
},
|
||||
{
|
||||
@ -1466,7 +1466,7 @@ data:
|
||||
},
|
||||
{
|
||||
"title": "Infrastructure namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
@ -1525,7 +1525,7 @@ data:
|
||||
"links": [
|
||||
{
|
||||
"title": "Workload namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"targetBlank": false
|
||||
},
|
||||
{
|
||||
@ -1535,7 +1535,7 @@ data:
|
||||
},
|
||||
{
|
||||
"title": "Infrastructure namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
@ -1594,7 +1594,7 @@ data:
|
||||
"links": [
|
||||
{
|
||||
"title": "Workload namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22",
|
||||
"targetBlank": false
|
||||
},
|
||||
{
|
||||
@ -1604,7 +1604,7 @@ data:
|
||||
},
|
||||
{
|
||||
"title": "Infrastructure namespaces only",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22",
|
||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22",
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
@ -2169,202 +2169,6 @@ data:
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"type": "stat",
|
||||
"title": "Maintenance Sweepers Ready",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 80
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 50
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 75
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 91.5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"type": "stat",
|
||||
"title": "Maintenance Cron Freshness (s)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 80
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 3600
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 10800
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 32,
|
||||
"type": "stat",
|
||||
"title": "Postmark Bounce Rate (1d)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 80
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "POSTMARK_OUTBOUND_BOUNCE_RATE{window=\"1d\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
@ -2379,16 +2183,16 @@ data:
|
||||
"name": "namespace_scope_cpu",
|
||||
"label": "CPU namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
@ -2398,7 +2202,7 @@ data:
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
@ -2413,16 +2217,16 @@ data:
|
||||
"name": "namespace_scope_gpu",
|
||||
"label": "GPU namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
@ -2432,7 +2236,7 @@ data:
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
@ -2447,16 +2251,16 @@ data:
|
||||
"name": "namespace_scope_ram",
|
||||
"label": "RAM namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
@ -2466,7 +2270,7 @@ data:
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
||||
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
|
||||
@ -209,7 +209,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})",
|
||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
|
||||
@ -418,6 +418,138 @@ data:
|
||||
}
|
||||
},
|
||||
"timeFrom": "90d"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"type": "stat",
|
||||
"title": "Maintenance Sweepers Ready",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 44
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 50
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 75
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 91.5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"type": "stat",
|
||||
"title": "Maintenance Cron Freshness (s)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 44
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 3600
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 10800
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
|
||||
168
services/monitoring/jetson-tegrastats-exporter.yaml
Normal file
168
services/monitoring/jetson-tegrastats-exporter.yaml
Normal file
@ -0,0 +1,168 @@
|
||||
# services/monitoring/jetson-tegrastats-exporter.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: jetson-tegrastats-exporter
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: jetson-tegrastats-exporter
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: jetson-tegrastats-exporter
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: jetson-tegrastats-exporter
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "9100"
|
||||
spec:
|
||||
serviceAccountName: default
|
||||
hostPID: true
|
||||
tolerations:
|
||||
- operator: Exists
|
||||
nodeSelector:
|
||||
jetson: "true"
|
||||
containers:
|
||||
- name: exporter
|
||||
# Exposes tegrastats output as Prometheus metrics for Jetson devices.
|
||||
image: python:3.10-slim
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
privileged: true
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 9100
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 256Mi
|
||||
env:
|
||||
- name: JETSON_EXPORTER_PORT
|
||||
value: "9100"
|
||||
volumeMounts:
|
||||
- name: script
|
||||
mountPath: /etc/tegrastats-exporter
|
||||
readOnly: true
|
||||
- name: tegrastats-bin
|
||||
mountPath: /host/usr/bin/tegrastats
|
||||
readOnly: true
|
||||
command:
|
||||
- python
|
||||
- /etc/tegrastats-exporter/exporter.py
|
||||
volumes:
|
||||
- name: script
|
||||
configMap:
|
||||
name: jetson-tegrastats-exporter-script
|
||||
defaultMode: 0555
|
||||
- name: tegrastats-bin
|
||||
hostPath:
|
||||
path: /usr/bin/tegrastats
|
||||
type: File
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: jetson-tegrastats-exporter
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: jetson-tegrastats-exporter
|
||||
spec:
|
||||
selector:
|
||||
app: jetson-tegrastats-exporter
|
||||
ports:
|
||||
- name: metrics
|
||||
port: 9100
|
||||
targetPort: metrics
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: jetson-tegrastats-exporter-script
|
||||
namespace: monitoring
|
||||
data:
|
||||
exporter.py: |
|
||||
import http.server
|
||||
import os
|
||||
import re
|
||||
import socketserver
|
||||
import subprocess
|
||||
import threading
|
||||
from time import time
|
||||
|
||||
# TCP port the metrics server listens on; overridable via the environment.
PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
# Latest value of every exported gauge, keyed by metric suffix.
# Shared between the tegrastats reader and the HTTP handler; guard with LOCK.
METRICS = {
    "gr3d_freq_percent": 0.0,
    "gpu_temp_c": 0.0,
    "cpu_temp_c": 0.0,
    "ram_used_mb": 0.0,
    "ram_total_mb": 0.0,
    "power_5v_in_mw": 0.0,
    "last_scrape_ts": 0.0,
}
LOCK = threading.Lock()

# Compiled once at import time. Each entry pairs a pattern with the METRICS
# keys that receive its capture groups, in order; surplus groups are ignored
# (only the first number of the POM_5V_IN pair is exported).
#
# NOTE: these are raw strings inside a YAML literal block, so nothing
# unescapes them. The backslashes must be single: r"\s" is the whitespace
# class, whereas r"\\s" matches a literal backslash followed by "s" and would
# never match real tegrastats output.
_PATTERNS = (
    (re.compile(r"GR3D_FREQ\s+(\d+)%"), ("gr3d_freq_percent",)),
    (re.compile(r"GPU@(\d+(?:\.\d+)?)C"), ("gpu_temp_c",)),
    (re.compile(r"CPU@(\d+(?:\.\d+)?)C"), ("cpu_temp_c",)),
    (re.compile(r"RAM\s+(\d+)/(\d+)MB"), ("ram_used_mb", "ram_total_mb")),
    (re.compile(r"POM_5V_IN\s+(\d+)/(\d+)"), ("power_5v_in_mw",)),
)


def parse_line(line: str) -> None:
    """Fold one line of tegrastats output into METRICS.

    Fields that do not appear in ``line`` keep their previous value.
    ``last_scrape_ts`` is set to the current time on every call, whether or
    not any field matched.

    Args:
        line: One line of text read from the tegrastats process.
    """
    updates = {}
    for pattern, keys in _PATTERNS:
        found = pattern.search(line)
        if found:
            for key, raw in zip(keys, found.groups()):
                updates[key] = float(raw)
    # Apply all fields in one critical section so a concurrent reader never
    # observes a half-updated sample.
    with LOCK:
        METRICS.update(updates)
        METRICS["last_scrape_ts"] = time()
|
||||
|
||||
def run_tegrastats():
    """Spawn tegrastats and hand every line it prints to parse_line.

    Blocks until the child's output stream closes, so it is intended to run
    on its own thread.
    """
    argv = ["/host/usr/bin/tegrastats", "--interval", "1000"]
    popen_options = {
        "stdout": subprocess.PIPE,
        "stderr": subprocess.STDOUT,  # fold stderr into the parsed stream
        "text": True,
        "bufsize": 1,  # line-buffered in text mode
    }
    child = subprocess.Popen(argv, **popen_options)
    for sample in child.stdout:
        parse_line(sample)
|
||||
|
||||
class Handler(http.server.BaseHTTPRequestHandler):
    """Serve the current METRICS snapshot at /metrics as Prometheus text."""

    def do_GET(self):
        """Answer GET /metrics with one gauge per METRICS entry; 404 otherwise."""
        if self.path != "/metrics":
            self.send_response(404)
            self.end_headers()
            return
        # Snapshot under the lock; parse_line mutates METRICS under the same
        # lock, so the copy is a consistent sample.
        with LOCK:
            metrics = METRICS.copy()
        out = []
        for k, v in metrics.items():
            out.append(f"# TYPE jetson_{k} gauge")
            out.append(f"jetson_{k} {v}")
        # Lines must be separated by real newline characters. This script is
        # stored in a YAML literal block, so "\n" reaches Python unchanged;
        # writing "\\n" would emit a literal backslash + "n" and collapse the
        # whole exposition into a single unparseable line.
        payload = ("\n".join(out) + "\n").encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/plain; version=0.0.4")
        # Content-Length counts bytes on the wire, so measure the encoded body.
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def log_message(self, fmt, *args):
        """Discard the per-request access log line."""
        return
|
||||
|
||||
if __name__ == "__main__":
    # Read tegrastats on a daemon thread so it never keeps the process alive
    # once the HTTP server stops.
    threading.Thread(target=run_tegrastats, daemon=True).start()
    # An empty host string binds the server to all interfaces.
    server = socketserver.TCPServer(("", PORT), Handler)
    with server:
        server.serve_forever()
|
||||
@ -14,6 +14,7 @@ resources:
|
||||
- grafana-dashboard-gpu.yaml
|
||||
- grafana-dashboard-mail.yaml
|
||||
- dcgm-exporter.yaml
|
||||
- jetson-tegrastats-exporter.yaml
|
||||
- postmark-exporter-service.yaml
|
||||
- postmark-exporter-deployment.yaml
|
||||
- grafana-alerting-config.yaml
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user