monitoring: add glue row and fix mail dns

This commit is contained in:
Brad Stein 2026-01-18 08:12:06 -03:00
parent 4874ccda4d
commit d7812623cd
6 changed files with 526 additions and 165 deletions

View File

@ -24,7 +24,7 @@ data:
192.168.22.9 live.bstein.dev
192.168.22.9 logs.bstein.dev
192.168.22.9 longhorn.bstein.dev
192.168.22.9 mail.bstein.dev
192.168.22.4 mail.bstein.dev
192.168.22.9 matrix.live.bstein.dev
192.168.22.9 metrics.bstein.dev
192.168.22.9 monero.bstein.dev

View File

@ -332,6 +332,8 @@ GLUE_MISSING = f"({GLUE_JOBS} unless {GLUE_LAST_SUCCESS})"
# Glue CronJob selectors with suspended jobs filtered out. `unless on(namespace,cronjob)`
# drops every left-hand series that has a GLUE_SUSPENDED match on those two labels.
GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
# Scalar-style counts for stat panels.
# PromQL aggregations over an empty instant vector return an empty vector, and an
# arithmetic binary operator with an empty side also returns empty. Without a fallback,
# "stale + missing" would vanish whenever no job is missing a success (the healthy case),
# and the single-term counts would render "No data" instead of 0. `or vector(0)` supplies
# a label-less 0 that matches the label-less aggregate result.
# NOTE(review): the rendered dashboard JSON appears to embed these expressions verbatim;
# re-render the dashboards after changing them.
GLUE_STALE_COUNT = (
    f"((sum({GLUE_STALE_ACTIVE}) or vector(0))"
    f" + (count({GLUE_MISSING_ACTIVE}) or vector(0)))"
)
GLUE_MISSING_COUNT = f"(count({GLUE_MISSING_ACTIVE}) or vector(0))"
GLUE_SUSPENDED_COUNT = f"(sum({GLUE_SUSPENDED}) or vector(0))"
# Node names joined into a regex alternation (name1|name2|...).
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
# Per-router request rate over a 5 minute window.
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
@ -1054,17 +1056,6 @@ def build_overview():
links=link_to("atlas-mail"),
)
)
panels.append(
stat_panel(
34,
"Glue Jobs Stale",
GLUE_STALE_COUNT,
{"h": 2, "w": 4, "x": 20, "y": 8},
unit="none",
thresholds=count_thresholds,
links=link_to("atlas-glue"),
)
)
storage_panels = [
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
@ -1120,6 +1111,50 @@ def build_overview():
)
)
panels.append(
{
"id": 34,
"type": "row",
"title": "Glue + Automation",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 25},
"collapsed": False,
"panels": [],
}
)
panels.append(
stat_panel(
35,
"Glue Jobs Stale",
GLUE_STALE_COUNT,
{"h": 6, "w": 8, "x": 0, "y": 26},
unit="none",
thresholds=count_thresholds,
links=link_to("atlas-glue"),
)
)
panels.append(
stat_panel(
36,
"Glue Jobs Missing Success",
GLUE_MISSING_COUNT,
{"h": 6, "w": 8, "x": 8, "y": 26},
unit="none",
thresholds=count_thresholds,
links=link_to("atlas-glue"),
)
)
panels.append(
stat_panel(
37,
"Glue Jobs Suspended",
GLUE_SUSPENDED_COUNT,
{"h": 6, "w": 8, "x": 16, "y": 26},
unit="none",
thresholds=count_thresholds,
links=link_to("atlas-glue"),
)
)
worker_filter = f"{WORKER_REGEX}"
panels.append(
timeseries_panel(
@ -2186,7 +2221,7 @@ def build_glue_dashboard():
table_panel(
2,
"Glue Jobs Missing Success",
GLUE_MISSING,
GLUE_MISSING_ACTIVE,
{"h": 4, "w": 6, "x": 6, "y": 0},
unit="none",
transformations=sort_desc,

View File

@ -88,7 +88,7 @@
},
"targets": [
{
"expr": "(kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})",
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)",
"refId": "A",
"instant": true
}

View File

@ -1110,81 +1110,6 @@
}
]
},
{
"id": 34,
"type": "stat",
"title": "Glue Jobs Stale",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 4,
"x": 20,
"y": 8
},
"targets": [
{
"expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-glue dashboard",
"url": "/d/atlas-glue",
"targetBlank": true
}
]
},
{
"id": 23,
"type": "stat",
@ -1676,6 +1601,244 @@
],
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
},
{
"id": 34,
"type": "row",
"title": "Glue + Automation",
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 25
},
"collapsed": false,
"panels": []
},
{
"id": 35,
"type": "stat",
"title": "Glue Jobs Stale",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 0,
"y": 26
},
"targets": [
{
"expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-glue dashboard",
"url": "/d/atlas-glue",
"targetBlank": true
}
]
},
{
"id": 36,
"type": "stat",
"title": "Glue Jobs Missing Success",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 8,
"y": 26
},
"targets": [
{
"expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-glue dashboard",
"url": "/d/atlas-glue",
"targetBlank": true
}
]
},
{
"id": 37,
"type": "stat",
"title": "Glue Jobs Suspended",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 16,
"y": 26
},
"targets": [
{
"expr": "sum(kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-glue dashboard",
"url": "/d/atlas-glue",
"targetBlank": true
}
]
},
{
"id": 14,
"type": "timeseries",

View File

@ -97,7 +97,7 @@ data:
},
"targets": [
{
"expr": "(kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})",
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)",
"refId": "A",
"instant": true
}

View File

@ -1119,81 +1119,6 @@ data:
}
]
},
{
"id": 34,
"type": "stat",
"title": "Glue Jobs Stale",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 4,
"x": 20,
"y": 8
},
"targets": [
{
"expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-glue dashboard",
"url": "/d/atlas-glue",
"targetBlank": true
}
]
},
{
"id": 23,
"type": "stat",
@ -1685,6 +1610,244 @@ data:
],
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
},
{
"id": 34,
"type": "row",
"title": "Glue + Automation",
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 25
},
"collapsed": false,
"panels": []
},
{
"id": 35,
"type": "stat",
"title": "Glue Jobs Stale",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 0,
"y": 26
},
"targets": [
{
"expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-glue dashboard",
"url": "/d/atlas-glue",
"targetBlank": true
}
]
},
{
"id": 36,
"type": "stat",
"title": "Glue Jobs Missing Success",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 8,
"y": 26
},
"targets": [
{
"expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-glue dashboard",
"url": "/d/atlas-glue",
"targetBlank": true
}
]
},
{
"id": 37,
"type": "stat",
"title": "Glue Jobs Suspended",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 16,
"y": 26
},
"targets": [
{
"expr": "sum(kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-glue dashboard",
"url": "/d/atlas-glue",
"targetBlank": true
}
]
},
{
"id": 14,
"type": "timeseries",