monitoring: fix infra scopes and add jetson metrics
This commit is contained in:
parent
3a798ae3b1
commit
fcc0a49369
@ -9,5 +9,6 @@ resources:
|
|||||||
- gitops-ui/kustomization.yaml
|
- gitops-ui/kustomization.yaml
|
||||||
- monitoring/kustomization.yaml
|
- monitoring/kustomization.yaml
|
||||||
- logging/kustomization.yaml
|
- logging/kustomization.yaml
|
||||||
|
- maintenance/kustomization.yaml
|
||||||
- longhorn-ui/kustomization.yaml
|
- longhorn-ui/kustomization.yaml
|
||||||
- ../platform/vault-csi/kustomization.yaml
|
- ../platform/vault-csi/kustomization.yaml
|
||||||
|
|||||||
@ -84,7 +84,18 @@ CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
|
|||||||
WORKER_TOTAL = len(WORKER_NODES)
|
WORKER_TOTAL = len(WORKER_NODES)
|
||||||
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
|
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
|
||||||
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
|
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
|
||||||
CP_ALLOWED_NS = "(^kube.*|.*-system$|^traefik$|^monitoring$)"
|
# Namespaces considered infrastructure (excluded from workload counts)
|
||||||
|
INFRA_NAMESPACES = [
|
||||||
|
"kube-system",
|
||||||
|
"longhorn-system",
|
||||||
|
"metallb-system",
|
||||||
|
"monitoring",
|
||||||
|
"flux-system",
|
||||||
|
"traefik",
|
||||||
|
]
|
||||||
|
INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$"
|
||||||
|
# Namespaces allowed on control plane without counting as workloads
|
||||||
|
CP_ALLOWED_NS = INFRA_REGEX
|
||||||
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
|
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
|
||||||
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
|
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
|
||||||
CONTROL_WORKLOADS_EXPR = (
|
CONTROL_WORKLOADS_EXPR = (
|
||||||
@ -300,9 +311,9 @@ STUCK_TABLE_EXPR = (
|
|||||||
")"
|
")"
|
||||||
)
|
)
|
||||||
|
|
||||||
NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$|^monitoring$)"'
|
NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
|
||||||
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
|
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
|
||||||
NAMESPACE_SCOPE_INFRA = 'namespace=~"(^kube.*|.*-system$|^traefik$|^monitoring$)"'
|
NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
|
||||||
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
|
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
|
||||||
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
|
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
|
||||||
GPU_NODE_REGEX = "|".join(GPU_NODES)
|
GPU_NODE_REGEX = "|".join(GPU_NODES)
|
||||||
@ -1232,51 +1243,6 @@ def build_overview():
|
|||||||
links=link_to("atlas-storage"),
|
links=link_to("atlas-storage"),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels.append(
|
|
||||||
stat_panel(
|
|
||||||
30,
|
|
||||||
"Maintenance Sweepers Ready",
|
|
||||||
'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100',
|
|
||||||
{"h": 6, "w": 8, "x": 0, "y": 80},
|
|
||||||
unit="percent",
|
|
||||||
thresholds=PERCENT_THRESHOLDS,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
panels.append(
|
|
||||||
stat_panel(
|
|
||||||
31,
|
|
||||||
"Maintenance Cron Freshness (s)",
|
|
||||||
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
|
|
||||||
{"h": 6, "w": 8, "x": 8, "y": 80},
|
|
||||||
unit="s",
|
|
||||||
thresholds={
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{"color": "green", "value": None},
|
|
||||||
{"color": "yellow", "value": 3600},
|
|
||||||
{"color": "red", "value": 10800},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
)
|
|
||||||
)
|
|
||||||
panels.append(
|
|
||||||
stat_panel(
|
|
||||||
32,
|
|
||||||
"Postmark Bounce Rate (1d)",
|
|
||||||
'POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}',
|
|
||||||
{"h": 6, "w": 8, "x": 16, "y": 80},
|
|
||||||
unit="percent",
|
|
||||||
thresholds={
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{"color": "green", "value": None},
|
|
||||||
{"color": "yellow", "value": 2},
|
|
||||||
{"color": "red", "value": 5},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"uid": "atlas-overview",
|
"uid": "atlas-overview",
|
||||||
"title": "Atlas Overview",
|
"title": "Atlas Overview",
|
||||||
@ -1743,6 +1709,33 @@ def build_storage_dashboard():
|
|||||||
time_from="90d",
|
time_from="90d",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
panels.append(
|
||||||
|
stat_panel(
|
||||||
|
30,
|
||||||
|
"Maintenance Sweepers Ready",
|
||||||
|
'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100',
|
||||||
|
{"h": 4, "w": 12, "x": 0, "y": 44},
|
||||||
|
unit="percent",
|
||||||
|
thresholds=PERCENT_THRESHOLDS,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
panels.append(
|
||||||
|
stat_panel(
|
||||||
|
31,
|
||||||
|
"Maintenance Cron Freshness (s)",
|
||||||
|
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
|
||||||
|
{"h": 4, "w": 12, "x": 12, "y": 44},
|
||||||
|
unit="s",
|
||||||
|
thresholds={
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": None},
|
||||||
|
{"color": "yellow", "value": 3600},
|
||||||
|
{"color": "red", "value": 10800},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
)
|
||||||
|
)
|
||||||
return {
|
return {
|
||||||
"uid": "atlas-storage",
|
"uid": "atlas-storage",
|
||||||
"title": "Atlas Storage",
|
"title": "Atlas Storage",
|
||||||
|
|||||||
@ -57,7 +57,7 @@
|
|||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Workload namespaces only",
|
"title": "Workload namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -67,7 +67,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"title": "Infrastructure namespaces only",
|
"title": "Infrastructure namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -207,16 +207,16 @@
|
|||||||
"name": "namespace_scope_cpu",
|
"name": "namespace_scope_cpu",
|
||||||
"label": "CPU namespace filter",
|
"label": "CPU namespace filter",
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"current": {
|
"current": {
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
"options": [
|
"options": [
|
||||||
{
|
{
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -226,7 +226,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"text": "infrastructure namespaces only",
|
"text": "infrastructure namespaces only",
|
||||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": false
|
"selected": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -241,16 +241,16 @@
|
|||||||
"name": "namespace_scope_gpu",
|
"name": "namespace_scope_gpu",
|
||||||
"label": "GPU namespace filter",
|
"label": "GPU namespace filter",
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"current": {
|
"current": {
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
"options": [
|
"options": [
|
||||||
{
|
{
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -260,7 +260,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"text": "infrastructure namespaces only",
|
"text": "infrastructure namespaces only",
|
||||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": false
|
"selected": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -275,16 +275,16 @@
|
|||||||
"name": "namespace_scope_ram",
|
"name": "namespace_scope_ram",
|
||||||
"label": "RAM namespace filter",
|
"label": "RAM namespace filter",
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"current": {
|
"current": {
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
"options": [
|
"options": [
|
||||||
{
|
{
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -294,7 +294,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"text": "infrastructure namespaces only",
|
"text": "infrastructure namespaces only",
|
||||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": false
|
"selected": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@ -142,7 +142,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})",
|
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"})",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@ -76,7 +76,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"}) or on() vector(0)",
|
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"}) or on() vector(0)",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -1447,7 +1447,7 @@
|
|||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Workload namespaces only",
|
"title": "Workload namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
"url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -1457,7 +1457,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"title": "Infrastructure namespaces only",
|
"title": "Infrastructure namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
"url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -1516,7 +1516,7 @@
|
|||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Workload namespaces only",
|
"title": "Workload namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -1526,7 +1526,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"title": "Infrastructure namespaces only",
|
"title": "Infrastructure namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -1585,7 +1585,7 @@
|
|||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Workload namespaces only",
|
"title": "Workload namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22",
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -1595,7 +1595,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"title": "Infrastructure namespaces only",
|
"title": "Infrastructure namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22",
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -2160,202 +2160,6 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 30,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "Maintenance Sweepers Ready",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 6,
|
|
||||||
"w": 8,
|
|
||||||
"x": 0,
|
|
||||||
"y": 80
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "yellow",
|
|
||||||
"value": 50
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "orange",
|
|
||||||
"value": 75
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "red",
|
|
||||||
"value": 91.5
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "percent",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "value"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 31,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "Maintenance Cron Freshness (s)",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 6,
|
|
||||||
"w": 8,
|
|
||||||
"x": 8,
|
|
||||||
"y": 80
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "yellow",
|
|
||||||
"value": 3600
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "red",
|
|
||||||
"value": 10800
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "s",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "value"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 32,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "Postmark Bounce Rate (1d)",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 6,
|
|
||||||
"w": 8,
|
|
||||||
"x": 16,
|
|
||||||
"y": 80
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "POSTMARK_OUTBOUND_BOUNCE_RATE{window=\"1d\"}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "yellow",
|
|
||||||
"value": 2
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "red",
|
|
||||||
"value": 5
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "percent",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "value"
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"schemaVersion": 39,
|
"schemaVersion": 39,
|
||||||
@ -2370,16 +2174,16 @@
|
|||||||
"name": "namespace_scope_cpu",
|
"name": "namespace_scope_cpu",
|
||||||
"label": "CPU namespace filter",
|
"label": "CPU namespace filter",
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"current": {
|
"current": {
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
"options": [
|
"options": [
|
||||||
{
|
{
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -2389,7 +2193,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"text": "infrastructure namespaces only",
|
"text": "infrastructure namespaces only",
|
||||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": false
|
"selected": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -2404,16 +2208,16 @@
|
|||||||
"name": "namespace_scope_gpu",
|
"name": "namespace_scope_gpu",
|
||||||
"label": "GPU namespace filter",
|
"label": "GPU namespace filter",
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"current": {
|
"current": {
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
"options": [
|
"options": [
|
||||||
{
|
{
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -2423,7 +2227,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"text": "infrastructure namespaces only",
|
"text": "infrastructure namespaces only",
|
||||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": false
|
"selected": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -2438,16 +2242,16 @@
|
|||||||
"name": "namespace_scope_ram",
|
"name": "namespace_scope_ram",
|
||||||
"label": "RAM namespace filter",
|
"label": "RAM namespace filter",
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"current": {
|
"current": {
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
"options": [
|
"options": [
|
||||||
{
|
{
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -2457,7 +2261,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"text": "infrastructure namespaces only",
|
"text": "infrastructure namespaces only",
|
||||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": false
|
"selected": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@ -200,7 +200,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})",
|
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"})",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@ -409,6 +409,138 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"timeFrom": "90d"
|
"timeFrom": "90d"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 30,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Maintenance Sweepers Ready",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 4,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 44
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "thresholds"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "yellow",
|
||||||
|
"value": 50
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "orange",
|
||||||
|
"value": 75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 91.5
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "percent",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
],
|
||||||
|
"fields": "",
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"textMode": "value"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 31,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Maintenance Cron Freshness (s)",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 4,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 44
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "thresholds"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "yellow",
|
||||||
|
"value": 3600
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 10800
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "s",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
],
|
||||||
|
"fields": "",
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"textMode": "value"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -28,13 +28,14 @@ spec:
|
|||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
nodeSelectorTerms:
|
nodeSelectorTerms:
|
||||||
- matchExpressions:
|
- matchExpressions:
|
||||||
- key: kubernetes.io/hostname
|
- key: kubernetes.io/arch
|
||||||
operator: In
|
operator: In
|
||||||
values:
|
values:
|
||||||
- titan-20
|
- amd64
|
||||||
- titan-21
|
- key: jetson
|
||||||
- titan-22
|
operator: NotIn
|
||||||
- titan-24
|
values:
|
||||||
|
- "true"
|
||||||
tolerations:
|
tolerations:
|
||||||
- operator: Exists
|
- operator: Exists
|
||||||
containers:
|
containers:
|
||||||
|
|||||||
@ -34,6 +34,7 @@ data:
|
|||||||
- uid: disk-pressure-root
|
- uid: disk-pressure-root
|
||||||
title: "Node rootfs high (>80%)"
|
title: "Node rootfs high (>80%)"
|
||||||
condition: C
|
condition: C
|
||||||
|
for: "10m"
|
||||||
data:
|
data:
|
||||||
- refId: A
|
- refId: A
|
||||||
relativeTimeRange:
|
relativeTimeRange:
|
||||||
@ -81,6 +82,7 @@ data:
|
|||||||
- uid: disk-growth-1h
|
- uid: disk-growth-1h
|
||||||
title: "Node rootfs growing fast (>1Gi in 1h)"
|
title: "Node rootfs growing fast (>1Gi in 1h)"
|
||||||
condition: C
|
condition: C
|
||||||
|
for: "10m"
|
||||||
data:
|
data:
|
||||||
- refId: A
|
- refId: A
|
||||||
relativeTimeRange:
|
relativeTimeRange:
|
||||||
@ -133,6 +135,7 @@ data:
|
|||||||
- uid: cpu-high-10m
|
- uid: cpu-high-10m
|
||||||
title: "Node CPU high (>90% for 10m)"
|
title: "Node CPU high (>90% for 10m)"
|
||||||
condition: C
|
condition: C
|
||||||
|
for: 10m
|
||||||
data:
|
data:
|
||||||
- refId: A
|
- refId: A
|
||||||
relativeTimeRange:
|
relativeTimeRange:
|
||||||
@ -185,6 +188,7 @@ data:
|
|||||||
- uid: maint-sweeper
|
- uid: maint-sweeper
|
||||||
title: "Maintenance sweeper not ready"
|
title: "Maintenance sweeper not ready"
|
||||||
condition: C
|
condition: C
|
||||||
|
for: "5m"
|
||||||
data:
|
data:
|
||||||
- refId: A
|
- refId: A
|
||||||
relativeTimeRange:
|
relativeTimeRange:
|
||||||
@ -232,10 +236,11 @@ data:
|
|||||||
- uid: maint-cron-stale
|
- uid: maint-cron-stale
|
||||||
title: "Maintenance CronJobs stale (>3h since success)"
|
title: "Maintenance CronJobs stale (>3h since success)"
|
||||||
condition: C
|
condition: C
|
||||||
|
for: "5m"
|
||||||
data:
|
data:
|
||||||
- refId: A
|
- refId: A
|
||||||
relativeTimeRange:
|
relativeTimeRange:
|
||||||
from: 0
|
from: 300
|
||||||
to: 0
|
to: 0
|
||||||
datasourceUid: atlas-vm
|
datasourceUid: atlas-vm
|
||||||
model:
|
model:
|
||||||
@ -284,6 +289,7 @@ data:
|
|||||||
- uid: postmark-bounce
|
- uid: postmark-bounce
|
||||||
title: "Postmark bounce rate high"
|
title: "Postmark bounce rate high"
|
||||||
condition: C
|
condition: C
|
||||||
|
for: "10m"
|
||||||
data:
|
data:
|
||||||
- refId: A
|
- refId: A
|
||||||
relativeTimeRange:
|
relativeTimeRange:
|
||||||
@ -331,6 +337,7 @@ data:
|
|||||||
- uid: postmark-api-down
|
- uid: postmark-api-down
|
||||||
title: "Postmark exporter down"
|
title: "Postmark exporter down"
|
||||||
condition: C
|
condition: C
|
||||||
|
for: "5m"
|
||||||
data:
|
data:
|
||||||
- refId: A
|
- refId: A
|
||||||
relativeTimeRange:
|
relativeTimeRange:
|
||||||
|
|||||||
@ -66,7 +66,7 @@ data:
|
|||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Workload namespaces only",
|
"title": "Workload namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -76,7 +76,7 @@ data:
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"title": "Infrastructure namespaces only",
|
"title": "Infrastructure namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -216,16 +216,16 @@ data:
|
|||||||
"name": "namespace_scope_cpu",
|
"name": "namespace_scope_cpu",
|
||||||
"label": "CPU namespace filter",
|
"label": "CPU namespace filter",
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"current": {
|
"current": {
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
"options": [
|
"options": [
|
||||||
{
|
{
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -235,7 +235,7 @@ data:
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"text": "infrastructure namespaces only",
|
"text": "infrastructure namespaces only",
|
||||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": false
|
"selected": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -250,16 +250,16 @@ data:
|
|||||||
"name": "namespace_scope_gpu",
|
"name": "namespace_scope_gpu",
|
||||||
"label": "GPU namespace filter",
|
"label": "GPU namespace filter",
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"current": {
|
"current": {
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
"options": [
|
"options": [
|
||||||
{
|
{
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -269,7 +269,7 @@ data:
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"text": "infrastructure namespaces only",
|
"text": "infrastructure namespaces only",
|
||||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": false
|
"selected": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -284,16 +284,16 @@ data:
|
|||||||
"name": "namespace_scope_ram",
|
"name": "namespace_scope_ram",
|
||||||
"label": "RAM namespace filter",
|
"label": "RAM namespace filter",
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"current": {
|
"current": {
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
"options": [
|
"options": [
|
||||||
{
|
{
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -303,7 +303,7 @@ data:
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"text": "infrastructure namespaces only",
|
"text": "infrastructure namespaces only",
|
||||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": false
|
"selected": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@ -151,7 +151,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})",
|
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"})",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@ -85,7 +85,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"}) or on() vector(0)",
|
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"}) or on() vector(0)",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -1456,7 +1456,7 @@ data:
|
|||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Workload namespaces only",
|
"title": "Workload namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
"url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -1466,7 +1466,7 @@ data:
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"title": "Infrastructure namespaces only",
|
"title": "Infrastructure namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
"url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -1525,7 +1525,7 @@ data:
|
|||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Workload namespaces only",
|
"title": "Workload namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -1535,7 +1535,7 @@ data:
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"title": "Infrastructure namespaces only",
|
"title": "Infrastructure namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -1594,7 +1594,7 @@ data:
|
|||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Workload namespaces only",
|
"title": "Workload namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22",
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -1604,7 +1604,7 @@ data:
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"title": "Infrastructure namespaces only",
|
"title": "Infrastructure namespaces only",
|
||||||
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22",
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Cflux-system%7Ctraefik%29%24%22",
|
||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -2169,202 +2169,6 @@ data:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 30,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "Maintenance Sweepers Ready",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 6,
|
|
||||||
"w": 8,
|
|
||||||
"x": 0,
|
|
||||||
"y": 80
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "yellow",
|
|
||||||
"value": 50
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "orange",
|
|
||||||
"value": 75
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "red",
|
|
||||||
"value": 91.5
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "percent",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "value"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 31,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "Maintenance Cron Freshness (s)",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 6,
|
|
||||||
"w": 8,
|
|
||||||
"x": 8,
|
|
||||||
"y": 80
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "yellow",
|
|
||||||
"value": 3600
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "red",
|
|
||||||
"value": 10800
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "s",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "value"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 32,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "Postmark Bounce Rate (1d)",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 6,
|
|
||||||
"w": 8,
|
|
||||||
"x": 16,
|
|
||||||
"y": 80
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "POSTMARK_OUTBOUND_BOUNCE_RATE{window=\"1d\"}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "yellow",
|
|
||||||
"value": 2
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "red",
|
|
||||||
"value": 5
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "percent",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "value"
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"schemaVersion": 39,
|
"schemaVersion": 39,
|
||||||
@ -2379,16 +2183,16 @@ data:
|
|||||||
"name": "namespace_scope_cpu",
|
"name": "namespace_scope_cpu",
|
||||||
"label": "CPU namespace filter",
|
"label": "CPU namespace filter",
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"current": {
|
"current": {
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
"options": [
|
"options": [
|
||||||
{
|
{
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -2398,7 +2202,7 @@ data:
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"text": "infrastructure namespaces only",
|
"text": "infrastructure namespaces only",
|
||||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": false
|
"selected": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -2413,16 +2217,16 @@ data:
|
|||||||
"name": "namespace_scope_gpu",
|
"name": "namespace_scope_gpu",
|
||||||
"label": "GPU namespace filter",
|
"label": "GPU namespace filter",
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"current": {
|
"current": {
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
"options": [
|
"options": [
|
||||||
{
|
{
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -2432,7 +2236,7 @@ data:
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"text": "infrastructure namespaces only",
|
"text": "infrastructure namespaces only",
|
||||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": false
|
"selected": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -2447,16 +2251,16 @@ data:
|
|||||||
"name": "namespace_scope_ram",
|
"name": "namespace_scope_ram",
|
||||||
"label": "RAM namespace filter",
|
"label": "RAM namespace filter",
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"query": "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"current": {
|
"current": {
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
"options": [
|
"options": [
|
||||||
{
|
{
|
||||||
"text": "workload namespaces only",
|
"text": "workload namespaces only",
|
||||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": true
|
"selected": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -2466,7 +2270,7 @@ data:
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"text": "infrastructure namespaces only",
|
"text": "infrastructure namespaces only",
|
||||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
|
"value": "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"",
|
||||||
"selected": false
|
"selected": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@ -209,7 +209,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})",
|
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|flux-system|traefik)$\"})",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@ -418,6 +418,138 @@ data:
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"timeFrom": "90d"
|
"timeFrom": "90d"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 30,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Maintenance Sweepers Ready",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 4,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 44
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "thresholds"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "yellow",
|
||||||
|
"value": 50
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "orange",
|
||||||
|
"value": 75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 91.5
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "percent",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
],
|
||||||
|
"fields": "",
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"textMode": "value"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 31,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Maintenance Cron Freshness (s)",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 4,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 44
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "thresholds"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "yellow",
|
||||||
|
"value": 3600
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 10800
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "s",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
],
|
||||||
|
"fields": "",
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"textMode": "value"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
168
services/monitoring/jetson-tegrastats-exporter.yaml
Normal file
168
services/monitoring/jetson-tegrastats-exporter.yaml
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
# services/monitoring/jetson-tegrastats-exporter.yaml
|
||||||
|
# DaemonSet that runs one tegrastats-to-Prometheus exporter pod on every node
# labelled jetson=true.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: jetson-tegrastats-exporter
  namespace: monitoring
  labels:
    app: jetson-tegrastats-exporter
spec:
  selector:
    matchLabels:
      app: jetson-tegrastats-exporter
  template:
    metadata:
      annotations:
        # Annotation-based scrape discovery; the port matches the container's
        # "metrics" port below.
        prometheus.io/scrape: "true"
        prometheus.io/port: "9100"
      labels:
        app: jetson-tegrastats-exporter
    spec:
      serviceAccountName: default
      hostPID: true
      # Restrict scheduling to Jetson nodes, and tolerate every taint so tainted
      # Jetson nodes are still covered.
      nodeSelector:
        jetson: "true"
      tolerations:
        - operator: Exists
      containers:
        - name: exporter
          # Stock Python image; the exporter itself is the script mounted from
          # the ConfigMap, and tegrastats comes from the host.
          image: python:3.10-slim
          imagePullPolicy: IfNotPresent
          command:
            - python
            - /etc/tegrastats-exporter/exporter.py
          env:
            - name: JETSON_EXPORTER_PORT
              value: "9100"
          ports:
            - name: metrics
              containerPort: 9100
          securityContext:
            privileged: true
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          volumeMounts:
            - name: script
              mountPath: /etc/tegrastats-exporter
              readOnly: true
            # The host's tegrastats binary, exposed at the path the script runs.
            - name: tegrastats-bin
              mountPath: /host/usr/bin/tegrastats
              readOnly: true
      volumes:
        - name: script
          configMap:
            name: jetson-tegrastats-exporter-script
            defaultMode: 0555
        - name: tegrastats-bin
          hostPath:
            path: /usr/bin/tegrastats
            type: File
|
||||||
|
---
|
||||||
|
# Service in front of the exporter pods; forwards port 9100 to each pod's
# named "metrics" container port.
apiVersion: v1
kind: Service
metadata:
  name: jetson-tegrastats-exporter
  namespace: monitoring
  labels:
    app: jetson-tegrastats-exporter
spec:
  ports:
    - name: metrics
      port: 9100
      targetPort: metrics
  selector:
    app: jetson-tegrastats-exporter
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: jetson-tegrastats-exporter-script
|
||||||
|
namespace: monitoring
|
||||||
|
data:
|
||||||
|
exporter.py: |
|
||||||
|
import http.server
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import socketserver
|
||||||
|
import subprocess
|
||||||
|
import threading
|
||||||
|
from time import time
|
||||||
|
|
||||||
|
# TCP port the metrics endpoint listens on; overridable via the environment.
PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))

# Most recent value of every exported metric, keyed by name without the
# "jetson_" prefix. Values stay at 0.0 until tegrastats reports them.
METRICS = {
    "gr3d_freq_percent": 0.0,
    "gpu_temp_c": 0.0,
    "cpu_temp_c": 0.0,
    "ram_used_mb": 0.0,
    "ram_total_mb": 0.0,
    "power_5v_in_mw": 0.0,
    "last_scrape_ts": 0.0,
}

# Guards METRICS: it is written by the tegrastats reader and read by the HTTP
# handler, which run on different threads.
LOCK = threading.Lock()

# (compiled pattern, metric names filled from its capture groups, in order).
# The patterns are raw strings, so a single backslash is what makes `\s` and
# `\d` regex escapes; a doubled backslash would match a literal "\" and never
# hit real tegrastats output.
_LINE_PATTERNS = (
    (re.compile(r"GR3D_FREQ\s+(\d+)%"), ("gr3d_freq_percent",)),
    (re.compile(r"GPU@(\d+(?:\.\d+)?)C"), ("gpu_temp_c",)),
    (re.compile(r"CPU@(\d+(?:\.\d+)?)C"), ("cpu_temp_c",)),
    (re.compile(r"RAM\s+(\d+)/(\d+)MB"), ("ram_used_mb", "ram_total_mb")),
    # Only the first number after POM_5V_IN is exported.
    (re.compile(r"POM_5V_IN\s+(\d+)/\d+"), ("power_5v_in_mw",)),
)


def parse_line(line: str):
    """Extract known fields from one tegrastats line and store them in METRICS.

    Fields that do not appear in ``line`` keep their previous value.
    ``last_scrape_ts`` is refreshed on every call, whether or not anything
    matched.

    Args:
        line: One line of tegrastats output.

    Returns:
        None. The result is the in-place update of the module-level METRICS.
    """
    updates = {}
    for pattern, names in _LINE_PATTERNS:
        match = pattern.search(line)
        if match:
            for name, raw in zip(names, match.groups()):
                updates[name] = float(raw)
    with LOCK:
        METRICS.update(updates)
        METRICS["last_scrape_ts"] = time()
|
||||||
|
|
||||||
|
def run_tegrastats():
    """Run tegrastats and feed each line it prints to parse_line.

    Launches /host/usr/bin/tegrastats with ``--interval 1000`` (milliseconds
    per NVIDIA's tegrastats usage text - confirm against the installed
    version), merging stderr into stdout. Blocks until the tool closes its
    output, so it is meant to run on a background thread.
    """
    cmd = ["/host/usr/bin/tegrastats", "--interval", "1000"]
    # Using Popen as a context manager closes the pipe and waits for the child
    # once its output ends, instead of leaving an unreaped process behind.
    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,  # line-buffered, so every report is parsed as it arrives
    ) as proc:
        for line in proc.stdout:
            parse_line(line)
|
||||||
|
|
||||||
|
class Handler(http.server.BaseHTTPRequestHandler):
    """HTTP handler that serves the METRICS snapshot in Prometheus text format."""

    def do_GET(self):
        """Answer ``GET /metrics`` with every metric as a gauge.

        Any other path gets an empty 404 response.
        """
        if self.path != "/metrics":
            self.send_response(404)
            self.end_headers()
            return
        # Copy under the lock so rendering never sees a half-applied update.
        with LOCK:
            snapshot = METRICS.copy()
        lines = []
        for name, value in snapshot.items():
            lines.append(f"# TYPE jetson_{name} gauge")
            lines.append(f"jetson_{name} {value}")
        # The exposition format is newline-delimited: join with a real "\n",
        # not an escaped backslash-n. Content-Length must count encoded bytes.
        payload = ("\n".join(lines) + "\n").encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/plain; version=0.0.4")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def log_message(self, fmt, *args):
        """Suppress the base class's per-request log output."""
        return
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Collect tegrastats samples on a daemon thread so it never blocks
    # interpreter shutdown, then serve metrics from the main thread.
    reader = threading.Thread(target=run_tegrastats, daemon=True)
    reader.start()
    httpd = socketserver.TCPServer(("", PORT), Handler)
    with httpd:
        httpd.serve_forever()
|
||||||
@ -14,6 +14,7 @@ resources:
|
|||||||
- grafana-dashboard-gpu.yaml
|
- grafana-dashboard-gpu.yaml
|
||||||
- grafana-dashboard-mail.yaml
|
- grafana-dashboard-mail.yaml
|
||||||
- dcgm-exporter.yaml
|
- dcgm-exporter.yaml
|
||||||
|
- jetson-tegrastats-exporter.yaml
|
||||||
- postmark-exporter-service.yaml
|
- postmark-exporter-service.yaml
|
||||||
- postmark-exporter-deployment.yaml
|
- postmark-exporter-deployment.yaml
|
||||||
- grafana-alerting-config.yaml
|
- grafana-alerting-config.yaml
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user